diff options
Diffstat (limited to 'llvm/lib')
109 files changed, 5609 insertions, 1013 deletions
| diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 853bd66..a572eef 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1582,6 +1582,23 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,    return nullptr;  } +/// Returns the absolute value of \p A. In the context of dependence analysis, +/// we need an absolute value in a mathematical sense. If \p A is the signed +/// minimum value, we cannot represent it unless extending the original type. +/// Thus if we cannot prove that \p A is not the signed minimum value, returns +/// nullptr. +static const SCEV *absSCEVNoSignedOverflow(const SCEV *A, ScalarEvolution &SE) { +  IntegerType *Ty = cast<IntegerType>(A->getType()); +  if (!Ty) +    return nullptr; + +  const SCEV *SMin = +      SE.getConstant(APInt::getSignedMinValue(Ty->getBitWidth())); +  if (!SE.isKnownPredicate(CmpInst::ICMP_NE, A, SMin)) +    return nullptr; +  return SE.getAbsExpr(A, /*IsNSW=*/true); +} +  /// Returns true iff \p Test is enabled.  static bool isDependenceTestEnabled(DependenceTestType Test) {    if (EnableDependenceTest == DependenceTestType::All) @@ -1669,21 +1686,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,    LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");    // check that |Delta| < iteration count -  if (const SCEV *UpperBound = -          collectUpperBound(CurSrcLoop, Delta->getType())) { +  bool IsDeltaLarge = [&] { +    const SCEV *UpperBound = collectUpperBound(CurSrcLoop, Delta->getType()); +    if (!UpperBound) +      return false; +      LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound);      LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); -    const SCEV *AbsDelta = -        SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); -    const SCEV *AbsCoeff = -        SE->isKnownNonNegative(Coeff) ? 
Coeff : SE->getNegativeSCEV(Coeff); +    const SCEV *AbsDelta = absSCEVNoSignedOverflow(Delta, *SE); +    const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); +    if (!AbsDelta || !AbsCoeff) +      return false;      const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); -    if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { -      // Distance greater than trip count - no dependence -      ++StrongSIVindependence; -      ++StrongSIVsuccesses; -      return true; -    } +    return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); +  }(); +  if (IsDeltaLarge) { +    // Distance greater than trip count - no dependence +    ++StrongSIVindependence; +    ++StrongSIVsuccesses; +    return true;    }    // Can we compute distance? @@ -2259,6 +2280,9 @@ bool DependenceInfo::weakZeroSrcSIVtest(    const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);    if (!ConstCoeff)      return false; + +  // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. +  // TODO: Bail out if it's a signed minimum value.    const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)                               ? SE->getNegativeSCEV(ConstCoeff)                               : ConstCoeff; @@ -2369,6 +2393,9 @@ bool DependenceInfo::weakZeroDstSIVtest(    const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);    if (!ConstCoeff)      return false; + +  // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. +  // TODO: Bail out if it's a signed minimum value.    const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)                               ? 
SE->getNegativeSCEV(ConstCoeff)                               : ConstCoeff; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 92a5b6f..b09f4ed 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -241,9 +241,13 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,        ColdBytes += TotalSize;        // If we have the max cold context size from summary information and have        // requested identification of contexts above a percentage of the max, see -      // if this context qualifies. -      if (MaxColdSize > 0 && MinPercentMaxColdSize < 100 && -          TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize) +      // if this context qualifies. We should assume this is large if we rebuilt +      // the trie from existing metadata (i.e. to update after inlining), in +      // which case we don't have a MaxSize from the profile - we assume any +      // context size info in existence on the metadata should be propagated. 
+      if (BuiltFromExistingMetadata || +          (MaxColdSize > 0 && MinPercentMaxColdSize < 100 && +           TotalSize * 100 >= MaxColdSize * MinPercentMaxColdSize))          LargeColdContext = true;      }      // Only add the context size info as metadata if we need it in the thin diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b425b95..1f10478 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -391,19 +391,6 @@ void CombinerHelper::applyCombineConcatVectors(    MI.eraseFromParent();  } -bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const { -  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && -         "Invalid instruction"); -  auto &Shuffle = cast<GShuffleVector>(MI); - -  Register SrcVec1 = Shuffle.getSrc1Reg(); -  Register SrcVec2 = Shuffle.getSrc2Reg(); - -  LLT SrcVec1Type = MRI.getType(SrcVec1); -  LLT SrcVec2Type = MRI.getType(SrcVec2); -  return SrcVec1Type.isVector() && SrcVec2Type.isVector(); -} -  void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const {    auto &Shuffle = cast<GShuffleVector>(MI); @@ -535,11 +522,9 @@ bool CombinerHelper::matchCombineShuffleVector(    LLT DstType = MRI.getType(MI.getOperand(0).getReg());    Register Src1 = MI.getOperand(1).getReg();    LLT SrcType = MRI.getType(Src1); -  // As bizarre as it may look, shuffle vector can actually produce -  // scalar! This is because at the IR level a <1 x ty> shuffle -  // vector is perfectly valid. -  unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1; -  unsigned SrcNumElts = SrcType.isVector() ? 
SrcType.getNumElements() : 1; + +  unsigned DstNumElts = DstType.getNumElements(); +  unsigned SrcNumElts = SrcType.getNumElements();    // If the resulting vector is smaller than the size of the source    // vectors being concatenated, we won't be able to replace the @@ -556,7 +541,7 @@ bool CombinerHelper::matchCombineShuffleVector(    //    // TODO: If the size between the source and destination don't match    //       we could still emit an extract vector element in that case. -  if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1) +  if (DstNumElts < 2 * SrcNumElts)      return false;    // Check that the shuffle mask can be broken evenly between the @@ -619,39 +604,6 @@ void CombinerHelper::applyCombineShuffleVector(    MI.eraseFromParent();  } -bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) const { -  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && -         "Invalid instruction kind"); - -  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); -  return Mask.size() == 1; -} - -void CombinerHelper::applyShuffleToExtract(MachineInstr &MI) const { -  Register DstReg = MI.getOperand(0).getReg(); -  Builder.setInsertPt(*MI.getParent(), MI); - -  int I = MI.getOperand(3).getShuffleMask()[0]; -  Register Src1 = MI.getOperand(1).getReg(); -  LLT Src1Ty = MRI.getType(Src1); -  int Src1NumElts = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1; -  Register SrcReg; -  if (I >= Src1NumElts) { -    SrcReg = MI.getOperand(2).getReg(); -    I -= Src1NumElts; -  } else if (I >= 0) -    SrcReg = Src1; - -  if (I < 0) -    Builder.buildUndef(DstReg); -  else if (!MRI.getType(SrcReg).isVector()) -    Builder.buildCopy(DstReg, SrcReg); -  else -    Builder.buildExtractVectorElementConstant(DstReg, SrcReg, I); - -  MI.eraseFromParent(); -} -  namespace {  /// Select a preference between two uses. 
CurrentUse is the current preference @@ -8369,7 +8321,7 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,      return false;    ArrayRef<int> Mask = Shuffle.getMask(); -  const unsigned NumSrcElems = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1; +  const unsigned NumSrcElems = Src1Ty.getNumElements();    bool TouchesSrc1 = false;    bool TouchesSrc2 = false; diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 04d9309..d6f23b6 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -602,6 +602,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,                           Depth + 1);      computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts,                           Depth + 1); +    OffsetKnown = OffsetKnown.sext(BitWidth); +    WidthKnown = WidthKnown.sext(BitWidth);      Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown);      // Sign extend the extracted value using shift left and arithmetic shift      // right. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index b49040b..1fc90d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3359,6 +3359,54 @@ bool IRTranslator::translateShuffleVector(const User &U,      Mask = SVI->getShuffleMask();    else      Mask = cast<ConstantExpr>(U).getShuffleMask(); + +  // As GISel does not represent <1 x > vectors as a separate type from scalars, +  // we transform shuffle_vector with a scalar output to an +  // ExtractVectorElement. If the input type is also scalar it becomes a Copy. 
+  unsigned DstElts = cast<FixedVectorType>(U.getType())->getNumElements(); +  unsigned SrcElts = +      cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements(); +  if (DstElts == 1) { +    unsigned M = Mask[0]; +    if (SrcElts == 1) { +      if (M == 0 || M == 1) +        return translateCopy(U, *U.getOperand(M), MIRBuilder); +      MIRBuilder.buildUndef(getOrCreateVReg(U)); +    } else { +      Register Dst = getOrCreateVReg(U); +      if (M < SrcElts) { +        MIRBuilder.buildExtractVectorElementConstant( +            Dst, getOrCreateVReg(*U.getOperand(0)), M); +      } else if (M < SrcElts * 2) { +        MIRBuilder.buildExtractVectorElementConstant( +            Dst, getOrCreateVReg(*U.getOperand(1)), M - SrcElts); +      } else { +        MIRBuilder.buildUndef(Dst); +      } +    } +    return true; +  } + +  // A single element src is transformed to a build_vector. +  if (SrcElts == 1) { +    SmallVector<Register> Ops; +    Register Undef; +    for (int M : Mask) { +      LLT SrcTy = getLLTForType(*U.getOperand(0)->getType(), *DL); +      if (M == 0 || M == 1) { +        Ops.push_back(getOrCreateVReg(*U.getOperand(M))); +      } else { +        if (!Undef.isValid()) { +          Undef = MRI->createGenericVirtualRegister(SrcTy); +          MIRBuilder.buildUndef(Undef); +        } +        Ops.push_back(Undef); +      } +    } +    MIRBuilder.buildBuildVector(getOrCreateVReg(U), Ops); +    return true; +  } +    ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask);    MIRBuilder        .buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)}, diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 38ec83f..178529f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4748,6 +4748,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {    case G_FMINIMUMNUM:    case G_FMAXIMUMNUM:      
return lowerFMinNumMaxNum(MI); +  case G_FMINIMUM: +  case G_FMAXIMUM: +    return lowerFMinimumMaximum(MI);    case G_MERGE_VALUES:      return lowerMergeValues(MI);    case G_UNMERGE_VALUES: @@ -5819,6 +5822,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(      } else if (InputUsed[0] == -1U) {        // No input vectors were used! The result is undefined.        Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); +    } else if (NewElts == 1) { +      Output = MIRBuilder.buildCopy(NarrowTy, Inputs[InputUsed[0]]).getReg(0);      } else {        Register Op0 = Inputs[InputUsed[0]];        // If only one input was used, use an undefined vector for the other. @@ -8775,6 +8780,77 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {    return Legalized;  } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) { +  unsigned Opc = MI.getOpcode(); +  auto [Dst, Src0, Src1] = MI.getFirst3Regs(); +  LLT Ty = MRI.getType(Dst); +  LLT CmpTy = Ty.changeElementSize(1); + +  bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM); +  unsigned OpcIeee = +      IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE; +  unsigned OpcNonIeee = +      IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM; +  bool MinMaxMustRespectOrderedZero = false; +  Register Res; + +  // IEEE variants don't need canonicalization +  if (LI.isLegalOrCustom({OpcIeee, Ty})) { +    Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0); +    MinMaxMustRespectOrderedZero = true; +  } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) { +    Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0); +  } else { +    auto Compare = MIRBuilder.buildFCmp( +        IsMax ? 
CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1); +    Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0); +  } + +  // Propagate any NaN of both operands +  if (!MI.getFlag(MachineInstr::FmNoNans) && +      (!isKnownNeverNaN(Src0, MRI) || isKnownNeverNaN(Src1, MRI))) { +    auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1); + +    LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType(); +    APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy)); +    Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0); +    if (Ty.isVector()) +      NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0); + +    Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0); +  } + +  // fminimum/fmaximum requires -0.0 less than +0.0 +  if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) { +    GISelValueTracking VT(MIRBuilder.getMF()); +    KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero); +    KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero); + +    if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) { +      const unsigned Flags = MI.getFlags(); +      Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0); +      auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero); + +      unsigned TestClass = IsMax ? 
fcPosZero : fcNegZero; + +      auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass); +      auto LHSSelect = +          MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res, Flags); + +      auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass); +      auto RHSSelect = +          MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect, Flags); + +      Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res, Flags).getReg(0); +    } +  } + +  MIRBuilder.buildCopy(Dst, Res); +  MI.eraseFromParent(); +  return Legalized; +} +  LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {    // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c    Register DstReg = MI.getOperand(0).getReg(); @@ -9016,22 +9092,18 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {        continue;      } -    if (Src0Ty.isScalar()) { -      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); -    } else { -      int NumElts = Src0Ty.getNumElements(); -      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; -      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts; -      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); -      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); -      BuildVec.push_back(Extract.getReg(0)); -    } +    assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR"); + +    int NumElts = Src0Ty.getNumElements(); +    Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; +    int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; +    auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); +    auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); +    BuildVec.push_back(Extract.getReg(0));    } -  if (DstTy.isVector()) -    MIRBuilder.buildBuildVector(DstReg, BuildVec); -  else -    MIRBuilder.buildCopy(DstReg, BuildVec[0]); +  assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR"); +  MIRBuilder.buildBuildVector(DstReg, BuildVec);    MI.eraseFromParent();    return Legalized;  } diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 27df7e3..4b4df98 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -800,10 +800,11 @@ MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res,    LLT DstTy = Res.getLLTTy(*getMRI());    LLT Src1Ty = Src1.getLLTTy(*getMRI());    LLT Src2Ty = Src2.getLLTTy(*getMRI()); -  const LLT DstElemTy = DstTy.isVector() ? DstTy.getElementType() : DstTy; -  const LLT ElemTy1 = Src1Ty.isVector() ? Src1Ty.getElementType() : Src1Ty; -  const LLT ElemTy2 = Src2Ty.isVector() ? 
Src2Ty.getElementType() : Src2Ty; +  const LLT DstElemTy = DstTy.getScalarType(); +  const LLT ElemTy1 = Src1Ty.getScalarType(); +  const LLT ElemTy2 = Src2Ty.getScalarType();    assert(DstElemTy == ElemTy1 && DstElemTy == ElemTy2); +  assert(Mask.size() > 1 && "Scalar G_SHUFFLE_VECTOR are not supported");    (void)DstElemTy;    (void)ElemTy1;    (void)ElemTy2; diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 6a464d9..4795d81 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -2788,6 +2788,9 @@ bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) {    if (expectAndConsume(MIToken::rparen))      return error("shufflemask should be terminated by ')'."); +  if (ShufMask.size() < 2) +    return error("shufflemask should have > 1 element"); +    ArrayRef<int> MaskAlloc = MF.allocateShuffleMask(ShufMask);    Dest = MachineOperand::CreateShuffleMask(MaskAlloc);    return false; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 1154855..c0710c4 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1924,13 +1924,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {      if (Src0Ty != Src1Ty)        report("Source operands must be the same type", MI); -    if (Src0Ty.getScalarType() != DstTy.getScalarType()) +    if (Src0Ty.getScalarType() != DstTy.getScalarType()) {        report("G_SHUFFLE_VECTOR cannot change element type", MI); +      break; +    } +    if (!Src0Ty.isVector()) { +      report("G_SHUFFLE_VECTOR must have vector src", MI); +      break; +    } +    if (!DstTy.isVector()) { +      report("G_SHUFFLE_VECTOR must have vector dst", MI); +      break; +    }      // Don't check that all operands are vector because scalars are used in      // place of 1 element vectors. -    int SrcNumElts = Src0Ty.isVector() ? 
Src0Ty.getNumElements() : 1; -    int DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1; +    int SrcNumElts = Src0Ty.getNumElements(); +    int DstNumElts = DstTy.getNumElements();      ArrayRef<int> MaskIdxes = MaskOp.getShuffleMask(); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 72b364c..697b779 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private:      unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }    }; -  using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; +  using LiveRegMap = SparseSet<LiveReg, unsigned, identity, uint16_t>;    /// This map contains entries for each virtual register that is currently    /// available in a physical register.    LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d2ea652..8676060 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19993,8 +19993,12 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,    //    nor a successor of N. Otherwise, if Op is folded that would    //    create a cycle.    unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); -  for (SDNode *Op : Ptr->users()) { +  for (SDUse &U : Ptr->uses()) { +    if (U.getResNo() != Ptr.getResNo()) +      continue; +      // Check for #1. 
+    SDNode *Op = U.getUser();      if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))        continue; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bfa566a..dee0909 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1162,6 +1162,43 @@ SDValue SelectionDAGBuilder::getMemoryRoot() {    return updateRoot(PendingLoads);  } +SDValue SelectionDAGBuilder::getFPOperationRoot(fp::ExceptionBehavior EB) { +  // If the new exception behavior differs from that of the pending +  // ones, chain up them and update the root. +  switch (EB) { +  case fp::ExceptionBehavior::ebMayTrap: +  case fp::ExceptionBehavior::ebIgnore: +    // Floating-point exceptions produced by such operations are not intended +    // to be observed, so the sequence of these operations does not need to be +    // preserved. +    // +    // They however must not be mixed with the instructions that have strict +    // exception behavior. Placing an operation with 'ebIgnore' behavior between +    // 'ebStrict' operations could distort the observed exception behavior. +    if (!PendingConstrainedFPStrict.empty()) { +      assert(PendingConstrainedFP.empty()); +      updateRoot(PendingConstrainedFPStrict); +    } +    break; +  case fp::ExceptionBehavior::ebStrict: +    // Floating-point exception produced by these operations may be observed, so +    // they must be correctly chained. If trapping on FP exceptions is +    // disabled, the exceptions can be observed only by functions that read +    // exception flags, like 'llvm.get_fpenv' or 'fetestexcept'. It means that +    // the order of operations is not significant between barriers. +    // +    // If trapping is enabled, each operation becomes an implicit observation +    // point, so the operations must be sequenced according their original +    // source order. 
+    if (!PendingConstrainedFP.empty()) { +      assert(PendingConstrainedFPStrict.empty()); +      updateRoot(PendingConstrainedFP); +    } +    // TODO: Add support for trapping-enabled scenarios. +  } +  return DAG.getRoot(); +} +  SDValue SelectionDAGBuilder::getRoot() {    // Chain up all pending constrained intrinsics together with all    // pending loads, by simply appending them to PendingLoads and @@ -8298,6 +8335,30 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,    }  } +void SelectionDAGBuilder::pushFPOpOutChain(SDValue Result, +                                           fp::ExceptionBehavior EB) { +  assert(Result.getNode()->getNumValues() == 2); +  SDValue OutChain = Result.getValue(1); +  assert(OutChain.getValueType() == MVT::Other); + +  // Instead of updating the root immediately, push the produced chain to the +  // appropriate list, deferring the update until the root is requested. In this +  // case, the nodes from the lists are chained using TokenFactor, indicating +  // that the operations are independent. +  // +  // In particular, the root is updated before any call that might access the +  // floating-point environment, except for constrained intrinsics. +  switch (EB) { +  case fp::ExceptionBehavior::ebMayTrap: +  case fp::ExceptionBehavior::ebIgnore: +    PendingConstrainedFP.push_back(OutChain); +    break; +  case fp::ExceptionBehavior::ebStrict: +    PendingConstrainedFPStrict.push_back(OutChain); +    break; +  } +} +  void SelectionDAGBuilder::visitConstrainedFPIntrinsic(      const ConstrainedFPIntrinsic &FPI) {    SDLoc sdl = getCurSDLoc(); @@ -8305,42 +8366,16 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(    // We do not need to serialize constrained FP intrinsics against    // each other or against (nonvolatile) loads, so they can be    // chained like loads. 
-  SDValue Chain = DAG.getRoot(); +  fp::ExceptionBehavior EB = *FPI.getExceptionBehavior(); +  SDValue Chain = getFPOperationRoot(EB);    SmallVector<SDValue, 4> Opers;    Opers.push_back(Chain);    for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)      Opers.push_back(getValue(FPI.getArgOperand(I))); -  auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) { -    assert(Result.getNode()->getNumValues() == 2); - -    // Push node to the appropriate list so that future instructions can be -    // chained up correctly. -    SDValue OutChain = Result.getValue(1); -    switch (EB) { -    case fp::ExceptionBehavior::ebIgnore: -      // The only reason why ebIgnore nodes still need to be chained is that -      // they might depend on the current rounding mode, and therefore must -      // not be moved across instruction that may change that mode. -      [[fallthrough]]; -    case fp::ExceptionBehavior::ebMayTrap: -      // These must not be moved across calls or instructions that may change -      // floating-point exception masks. -      PendingConstrainedFP.push_back(OutChain); -      break; -    case fp::ExceptionBehavior::ebStrict: -      // These must not be moved across calls or instructions that may change -      // floating-point exception masks or read floating-point exception flags. -      // In addition, they cannot be optimized out even if unused. 
-      PendingConstrainedFPStrict.push_back(OutChain); -      break; -    } -  }; -    const TargetLowering &TLI = DAG.getTargetLoweringInfo();    EVT VT = TLI.getValueType(DAG.getDataLayout(), FPI.getType());    SDVTList VTs = DAG.getVTList(VT, MVT::Other); -  fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();    SDNodeFlags Flags;    if (EB == fp::ExceptionBehavior::ebIgnore) @@ -8364,7 +8399,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(          !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {        Opers.pop_back();        SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags); -      pushOutChain(Mul, EB); +      pushFPOpOutChain(Mul, EB);        Opcode = ISD::STRICT_FADD;        Opers.clear();        Opers.push_back(Mul.getValue(1)); @@ -8395,7 +8430,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(    }    SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags); -  pushOutChain(Result, EB); +  pushFPOpOutChain(Result, EB);    SDValue FPResult = Result.getValue(0);    setValue(&FPI, FPResult); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c7577fa..47e19f7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -195,6 +195,11 @@ private:    /// Update root to include all chains from the Pending list.    SDValue updateRoot(SmallVectorImpl<SDValue> &Pending); +  /// Given a node representing a floating-point operation and its specified +  /// exception behavior, this either updates the root or stores the node in +  /// a list to be added to chains latter. +  void pushFPOpOutChain(SDValue Result, fp::ExceptionBehavior EB); +    /// A unique monotonically increasing number used to order the SDNodes we    /// create.    unsigned SDNodeOrder; @@ -300,6 +305,13 @@ public:    /// memory node that may need to be ordered after any prior load instructions.    
SDValue getMemoryRoot(); +  /// Return the current virtual root of the Selection DAG, flushing +  /// PendingConstrainedFP or PendingConstrainedFPStrict items if the new +  /// exception behavior (specified by \p EB) differs from that of the pending +  /// instructions. This must be done before emitting constrained FP operation +  /// call. +  SDValue getFPOperationRoot(fp::ExceptionBehavior EB); +    /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict)    /// items. This must be done before emitting any call other any other node    /// that may need to be ordered after FP instructions due to other side diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 060b1dd..59798b3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2097,6 +2097,11 @@ Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {  }  Function *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const { +  // MSVC CRT has a function to validate security cookie. +  RTLIB::LibcallImpl SecurityCheckCookieLibcall = +      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); +  if (SecurityCheckCookieLibcall != RTLIB::Unsupported) +    return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));    return nullptr;  } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 488b078..1096e57 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -4082,10 +4082,10 @@ void AssemblyWriter::printTypeIdentities() {  /// printFunction - Print all aspects of a function.  
void AssemblyWriter::printFunction(const Function *F) { -  if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); -    if (F->isMaterializable())      Out << "; Materializable\n"; +  else if (AnnotationWriter) +    AnnotationWriter->emitFunctionAnnot(F, Out);    const AttributeList &Attrs = F->getAttributes();    if (Attrs.hasFnAttrs()) { diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 30b5e48..e19336e 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -403,9 +403,14 @@ void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,                             Metadata *Val) {    NamedMDNode *ModFlags = getOrInsertModuleFlagsMetadata();    // Replace the flag if it already exists. -  for (MDNode *Flag : ModFlags->operands()) { +  for (unsigned i = 0; i < ModFlags->getNumOperands(); ++i) { +    MDNode *Flag = ModFlags->getOperand(i);      if (cast<MDString>(Flag->getOperand(1))->getString() == Key) { -      Flag->replaceOperandWith(2, Val); +      Type *Int32Ty = Type::getInt32Ty(Context); +      Metadata *Ops[3] = { +          ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Behavior)), +          MDString::get(Context, Key), Val}; +      ModFlags->setOperand(i, MDNode::get(Context, Ops));        return;      }    } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 9d0fa11..4bc2a18 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -471,16 +471,14 @@ static void thinLTOInternalizeAndPromoteGUID(      ValueInfo VI, function_ref<bool(StringRef, ValueInfo)> isExported,      function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>          isPrevailing) { -  auto ExternallyVisibleCopies = -      llvm::count_if(VI.getSummaryList(), -                     [](const std::unique_ptr<GlobalValueSummary> &Summary) { -                       return !GlobalValue::isLocalLinkage(Summary->linkage()); -                     }); -    // Before performing index-based internalization and promotion 
for this GUID,    // the local flag should be consistent with the summary list linkage types.    VI.verifyLocal(); +  const bool SingleExternallyVisibleCopy = +      VI.getSummaryList().size() == 1 && +      !GlobalValue::isLocalLinkage(VI.getSummaryList().front()->linkage()); +    for (auto &S : VI.getSummaryList()) {      // First see if we need to promote an internal value because it is not      // exported. @@ -543,7 +541,9 @@ static void thinLTOInternalizeAndPromoteGUID(          GlobalValue::isExternalWeakLinkage(S->linkage()))        continue; -    if (isPrevailing(VI.getGUID(), S.get()) && ExternallyVisibleCopies == 1) +    // We may have a single summary copy that is externally visible but not +    // prevailing if the prevailing copy is in a native object. +    if (SingleExternallyVisibleCopy && isPrevailing(VI.getGUID(), S.get()))        S->setLinkage(GlobalValue::InternalLinkage);    }  } @@ -1086,15 +1086,15 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,            GlobalValue::getGlobalIdentifier(Sym.getIRName(),                                             GlobalValue::ExternalLinkage, ""));        if (R.Prevailing) -        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier(); +        ThinLTO.setPrevailingModuleForGUID(GUID, BM.getModuleIdentifier());      }    }    if (Error Err =            BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),                           [&](GlobalValue::GUID GUID) { -                           return ThinLTO.PrevailingModuleForGUID[GUID] == -                                  BM.getModuleIdentifier(); +                           return ThinLTO.isPrevailingModuleForGUID( +                               GUID, BM.getModuleIdentifier());                           }))      return Err;    LLVM_DEBUG(dbgs() << "Module " << BM.getModuleIdentifier() << "\n"); @@ -1108,8 +1108,8 @@ LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,            
GlobalValue::getGlobalIdentifier(Sym.getIRName(),                                             GlobalValue::ExternalLinkage, ""));        if (R.Prevailing) { -        assert(ThinLTO.PrevailingModuleForGUID[GUID] == -               BM.getModuleIdentifier()); +        assert( +            ThinLTO.isPrevailingModuleForGUID(GUID, BM.getModuleIdentifier()));          // For linker redefined symbols (via --wrap or --defsym) we want to          // switch the linkage to `weak` to prevent IPOs from happening. @@ -1988,7 +1988,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,                                 LocalWPDTargetsMap);    auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { -    return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); +    return ThinLTO.isPrevailingModuleForGUID(GUID, S->modulePath());    };    if (EnableMemProfContextDisambiguation) {      MemProfContextDisambiguation ContextDisambiguation; diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 1e1d0a6..70c4577 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -73,9 +73,10 @@ add_llvm_component_library(LLVMMC    ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC    LINK_COMPONENTS +  BinaryFormat +  DebugInfoDWARFLowLevel    Support    TargetParser -  BinaryFormat    DEPENDS    intrinsics_gen diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index d6fa54c..e0a90df 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ b/llvm/lib/MC/MCSFrame.cpp @@ -8,6 +8,8 @@  #include "llvm/MC/MCSFrame.h"  #include "llvm/BinaryFormat/SFrame.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCObjectFileInfo.h" @@ -211,8 +213,152 @@ class SFrameEmitterImpl {      return true;    } +  // Technically, the escape data could be anything, but it is commonly a dwarf +  // CFI 
program. Even then, it could contain an arbitrarily complicated Dwarf +  // expression. Following gnu-gas, look for certain common cases that could +  // invalidate an FDE, emit a warning for those sequences, and don't generate +  // an FDE in those cases. Allow any that are known safe. It is likely that +  // more thorough test cases could refine this code, but it handles the most +  // important ones compatibly with gas. +  // Returns true if the CFI escape sequence is safe for sframes. +  bool isCFIEscapeSafe(SFrameFDE &FDE, const SFrameFRE &FRE, +                       const MCCFIInstruction &CFI) { +    const MCAsmInfo *AI = Streamer.getContext().getAsmInfo(); +    DWARFDataExtractorSimple data(CFI.getValues(), AI->isLittleEndian(), +                                  AI->getCodePointerSize()); + +    // Normally, both alignment factors are extracted from the enclosing Dwarf +    // FDE or CIE. We don't have one here. Alignments are used for scaling +    // factors for ops like CFA_def_cfa_offset_sf. But this particular function +    // is only interested in registers. +    dwarf::CFIProgram P(/*CodeAlignmentFactor=*/1, +                        /*DataAlignmentFactor=*/1, +                        Streamer.getContext().getTargetTriple().getArch()); +    uint64_t Offset = 0; +    if (P.parse(data, &Offset, CFI.getValues().size())) { +      // Not a parsable dwarf expression. Assume the worst. +      Streamer.getContext().reportWarning( +          CFI.getLoc(), +          "skipping SFrame FDE; .cfi_escape with unknown effects"); +      return false; +    } + +    // This loop deals with dwarf::CFIProgram::Instructions. Everywhere else +    // this file deals with MCCFIInstructions. +    for (const dwarf::CFIProgram::Instruction &I : P) { +      switch (I.Opcode) { +      case dwarf::DW_CFA_nop: +        break; +      case dwarf::DW_CFA_val_offset: { +        // First argument is a register. 
Anything that touches CFA, FP, or RA is +        // a problem, but allow others through. As an even more special case, +        // allow SP + 0. +        auto Reg = I.getOperandAsUnsigned(P, 0); +        // The parser should have failed in this case. +        assert(Reg && "DW_CFA_val_offset with no register."); +        bool SPOk = true; +        if (*Reg == SPReg) { +          auto Opnd = I.getOperandAsSigned(P, 1); +          if (!Opnd || *Opnd != 0) +            SPOk = false; +        } +        if (!SPOk || *Reg == RAReg || *Reg == FPReg) { +          StringRef RN = *Reg == SPReg +                             ? "SP reg " +                             : (*Reg == FPReg ? "FP reg " : "RA reg "); +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine( +                  "skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with ") + +                  RN + Twine(*Reg)); +          return false; +        } +      } break; +      case dwarf::DW_CFA_expression: { +        // First argument is a register. Anything that touches CFA, FP, or RA is +        // a problem, but allow others through. +        auto Reg = I.getOperandAsUnsigned(P, 0); +        if (!Reg) { +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              "skipping SFrame FDE; .cfi_escape with unknown effects"); +          return false; +        } +        if (*Reg == SPReg || *Reg == RAReg || *Reg == FPReg) { +          StringRef RN = *Reg == SPReg +                             ? "SP reg " +                             : (*Reg == FPReg ? 
"FP reg " : "RA reg "); +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine( +                  "skipping SFrame FDE; .cfi_escape DW_CFA_expression with ") + +                  RN + Twine(*Reg)); +          return false; +        } +      } break; +      case dwarf::DW_CFA_GNU_args_size: { +        auto Size = I.getOperandAsSigned(P, 0); +        // Zero size doesn't affect the cfa. +        if (Size && *Size == 0) +          break; +        if (FRE.Info.getBaseRegister() != BaseReg::FP) { +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine("skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size " +                    "with non frame-pointer CFA")); +          return false; +        } +      } break; +      // Cases that gas doesn't specially handle. TODO: Some of these could be +      // analyzed and handled instead of just punting. But these are uncommon, +      // or should be written as normal cfi directives. Some will need fixes to +      // the scaling factor. 
+      case dwarf::DW_CFA_advance_loc: +      case dwarf::DW_CFA_offset: +      case dwarf::DW_CFA_restore: +      case dwarf::DW_CFA_set_loc: +      case dwarf::DW_CFA_advance_loc1: +      case dwarf::DW_CFA_advance_loc2: +      case dwarf::DW_CFA_advance_loc4: +      case dwarf::DW_CFA_offset_extended: +      case dwarf::DW_CFA_restore_extended: +      case dwarf::DW_CFA_undefined: +      case dwarf::DW_CFA_same_value: +      case dwarf::DW_CFA_register: +      case dwarf::DW_CFA_remember_state: +      case dwarf::DW_CFA_restore_state: +      case dwarf::DW_CFA_def_cfa: +      case dwarf::DW_CFA_def_cfa_register: +      case dwarf::DW_CFA_def_cfa_offset: +      case dwarf::DW_CFA_def_cfa_expression: +      case dwarf::DW_CFA_offset_extended_sf: +      case dwarf::DW_CFA_def_cfa_sf: +      case dwarf::DW_CFA_def_cfa_offset_sf: +      case dwarf::DW_CFA_val_offset_sf: +      case dwarf::DW_CFA_val_expression: +      case dwarf::DW_CFA_MIPS_advance_loc8: +      case dwarf::DW_CFA_AARCH64_negate_ra_state_with_pc: +      case dwarf::DW_CFA_AARCH64_negate_ra_state: +      case dwarf::DW_CFA_LLVM_def_aspace_cfa: +      case dwarf::DW_CFA_LLVM_def_aspace_cfa_sf: +        Streamer.getContext().reportWarning( +            CFI.getLoc(), "skipping SFrame FDE; .cfi_escape " +                          "CFA expression with unknown side effects"); +        return false; +      default: +        // Dwarf expression was only partially valid, and user could have +        // written anything. +        Streamer.getContext().reportWarning( +            CFI.getLoc(), +            "skipping SFrame FDE; .cfi_escape with unknown effects"); +        return false; +      } +    } +    return true; +  } +    // Add the effects of CFI to the current FDE, creating a new FRE when -  // necessary. +  // necessary. Return true if the CFI is representable in the sframe format.    
bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) {      switch (CFI.getOperation()) {      case MCCFIInstruction::OpDefCfaRegister: @@ -265,10 +411,11 @@ class SFrameEmitterImpl {        FRE = FDE.SaveState.pop_back_val();        return true;      case MCCFIInstruction::OpEscape: -      // TODO: Implement. Will use FDE. -      return true; +      // This is a string of bytes that contains an arbitrary dwarf-expression +      // that may or may not affect unwind info. +      return isCFIEscapeSafe(FDE, FRE, CFI);      default: -      // Instructions that don't affect the CFA, RA, and SP can be safely +      // Instructions that don't affect the CFA, RA, and FP can be safely        // ignored.        return true;      } diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 67483ba..9d45096 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -240,7 +240,8 @@ private:    getGroupEntry(StringRef GroupName, StringRef GroupDescription) {      std::pair<TimerGroup *, Name2TimerMap> &GroupEntry = Map[GroupName];      if (!GroupEntry.first) -      GroupEntry.first = new TimerGroup(GroupName, GroupDescription); +      GroupEntry.first = +          new TimerGroup(GroupName, GroupDescription, /*PrintOnExit=*/true);      return GroupEntry;    } @@ -270,9 +271,10 @@ TimerGroup &NamedRegionTimer::getNamedTimerGroup(StringRef GroupName,  static TimerGroup *TimerGroupList = nullptr;  TimerGroup::TimerGroup(StringRef Name, StringRef Description, -                       sys::SmartMutex<true> &lock) +                       sys::SmartMutex<true> &lock, bool PrintOnExit)      : Name(Name.begin(), Name.end()), -      Description(Description.begin(), Description.end()) { +      Description(Description.begin(), Description.end()), +      PrintOnExit(PrintOnExit) {    // Add the group to TimerGroupList.    
sys::SmartScopedLock<true> L(lock);    if (TimerGroupList) @@ -282,12 +284,12 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description,    TimerGroupList = this;  } -TimerGroup::TimerGroup(StringRef Name, StringRef Description) -    : TimerGroup(Name, Description, timerLock()) {} +TimerGroup::TimerGroup(StringRef Name, StringRef Description, bool PrintOnExit) +    : TimerGroup(Name, Description, timerLock(), PrintOnExit) {}  TimerGroup::TimerGroup(StringRef Name, StringRef Description, -                       const StringMap<TimeRecord> &Records) -    : TimerGroup(Name, Description) { +                       const StringMap<TimeRecord> &Records, bool PrintOnExit) +    : TimerGroup(Name, Description, PrintOnExit) {    TimersToPrint.reserve(Records.size());    for (const auto &P : Records)      TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()), @@ -301,7 +303,7 @@ TimerGroup::~TimerGroup() {    while (FirstTimer)      removeTimer(*FirstTimer); -  if (!TimersToPrint.empty()) { +  if (!TimersToPrint.empty() && PrintOnExit) {      std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();      PrintQueuedTimers(*OutStream);    } @@ -530,7 +532,7 @@ public:    sys::SmartMutex<true> TimerLock;    TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers", -                               TimerLock}; +                               TimerLock, /*PrintOnExit=*/true};    SignpostEmitter Signposts;    // Order of these members and initialization below is important. 
For example diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 86f9548..a4529a5 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -73,9 +73,16 @@ def SVEUnsupported : AArch64Unsupported {                        SVE2Unsupported.F);  } -let F = [HasSME2p2, HasSVE2p2_or_SME2p2, HasNonStreamingSVE_or_SME2p2, -         HasNonStreamingSVE2p2_or_SME2p2] in -def SME2p2Unsupported : AArch64Unsupported; +def SME2p3Unsupported : AArch64Unsupported { +  let F = [HasSVE2p3_or_SME2p3, HasSVE_B16MM]; +} + +def SME2p2Unsupported : AArch64Unsupported { +  let F = !listconcat([HasSME2p2, HasSVE2p2_or_SME2p2, +           HasNonStreamingSVE_or_SME2p2, +           HasNonStreamingSVE2p2_or_SME2p2], +           SME2p3Unsupported.F); +}  def SME2p1Unsupported : AArch64Unsupported {    let F = !listconcat([HasSME2p1, HasSVE2p1_or_SME2p1, diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ecaeff7..b3ec65c 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -71,7 +71,6 @@ def AArch64PreLegalizerCombiner: GICombiner<    "AArch64PreLegalizerCombinerImpl", [all_combines,                                        icmp_redundant_trunc,                                        fold_global_offset, -                                      shuffle_to_extract,                                        ext_addv_to_udot_addv,                                        ext_uaddv_to_uaddlv,                                        push_sub_through_zext, diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 46f5f0c..0e94b78 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -585,6 +585,47 @@ def FeatureSME_TMOP: ExtensionWithMArch<"sme-tmop", "SME_TMOP", "FEAT_SME_TMOP",  def FeatureSSVE_FEXPA : ExtensionWithMArch<"ssve-fexpa", 
"SSVE_FEXPA", "FEAT_SSVE_FEXPA",    "Enable SVE FEXPA instruction in Streaming SVE mode", [FeatureSME2]>; +//===----------------------------------------------------------------------===// +//  Armv9.7 Architecture Extensions +//===----------------------------------------------------------------------===// + +def FeatureCMH : ExtensionWithMArch<"cmh", "CMH", "FEAT_CMH", +  "Enable Armv9.7-A Contention Management Hints">; + +def FeatureLSCP : ExtensionWithMArch<"lscp", "LSCP", "FEAT_LSCP", +  "Enable Armv9.7-A Load-acquire and store-release pair extension">; + +def FeatureTLBID: ExtensionWithMArch<"tlbid", "TLBID", "FEAT_TLBID", +  "Enable Armv9.7-A TLBI Domains extension">; + +def FeatureMPAMv2: ExtensionWithMArch<"mpamv2", "MPAMv2", "FEAT_MPAMv2", +  "Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions">; + +def FeatureMTETC: ExtensionWithMArch<"mtetc", "MTETC", "FEAT_MTETC", +  "Enable Virtual Memory Tagging Extension">; + +def FeatureGCIE: ExtensionWithMArch<"gcie", "GCIE", "FEAT_GCIE", +  "Enable GICv5 (Generic Interrupt Controller) CPU Interface Extension">; + +def FeatureSVE2p3 : ExtensionWithMArch<"sve2p3", "SVE2p3", "FEAT_SVE2p3", +  "Enable Armv9.7-A Scalable Vector Extension 2.3 instructions", [FeatureSVE2p2]>; + +def FeatureSME2p3 : ExtensionWithMArch<"sme2p3", "SME2p3", "FEAT_SME2p3", +  "Enable Armv9.7-A Scalable Matrix Extension 2.3 instructions", [FeatureSME2p2]>; + +def FeatureSVE_B16MM : ExtensionWithMArch<"sve-b16mm", "SVE_B16MM", "FEAT_SVE_B16MM", +  "Enable Armv9.7-A SVE non-widening BFloat16 matrix multiply-accumulate", [FeatureSVE]>; + +def FeatureF16MM : ExtensionWithMArch<"f16mm", "F16MM", "FEAT_F16MM", +  "Enable Armv9.7-A non-widening half-precision matrix multiply-accumulate", [FeatureFullFP16]>; + +def FeatureF16F32DOT : ExtensionWithMArch<"f16f32dot", "F16F32DOT", "FEAT_F16F32DOT", +  "Enable Armv9.7-A Advanced SIMD half-precision dot product accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; + +def 
FeatureF16F32MM : ExtensionWithMArch<"f16f32mm", "F16F32MM", "FEAT_F16F32MM", +  "Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision", [FeatureNEON, FeatureFullFP16]>; + +//===----------------------------------------------------------------------===//  //  Other Features  //===----------------------------------------------------------------------===// @@ -939,9 +980,12 @@ def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a",    [HasV9_4aOps, FeatureCPA],    !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA,  FeatureLUT, FeatureFAMINMAX])>;  def HasV9_6aOps : Architecture64<9, 6, "a", "v9.6a", -  [HasV9_5aOps, FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2, FeatureLSUI, FeatureOCCMO], -  !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2, +  [HasV9_5aOps, FeatureCMPBR, FeatureLSUI, FeatureOCCMO], +  !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR,      FeatureLSUI, FeatureOCCMO])>; +def HasV9_7aOps : Architecture64<9, 7, "a", "v9.7a", +  [HasV9_6aOps, FeatureSVE2p3, FeatureFPRCVT], +  !listconcat(HasV9_6aOps.DefaultExts, [FeatureSVE2p3, FeatureFPRCVT])>;  def HasV8_0rOps : Architecture64<8, 0, "r", "v8r",    [ //v8.1      FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c..d16b116 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9002,12 +9002,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,  }  static SMECallAttrs -getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, +getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,                  const TargetLowering::CallLoweringInfo &CLI) {    if (CLI.CB) -    return SMECallAttrs(*CLI.CB, &TLI); +    return SMECallAttrs(*CLI.CB, &RTLCI);    if (auto *ES = 
dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) -    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); +    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));  } @@ -9029,7 +9029,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(    // SME Streaming functions are not eligible for TCO as they may require    // the streaming mode or ZA to be restored after returning from the call. -  SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); +  SMECallAttrs CallAttrs = +      getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);    if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||        CallAttrs.requiresPreservingAllZAState() ||        CallAttrs.caller().hasStreamingBody()) @@ -9454,7 +9455,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,    }    // Determine whether we need any streaming mode changes. -  SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); +  SMECallAttrs CallAttrs = +      getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);    std::optional<unsigned> ZAMarkerNode;    bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); @@ -19476,6 +19478,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {                       Op1 ? Op1 : Mul->getOperand(1));  } +// Multiplying an RDSVL value by a constant can sometimes be done cheaper by +// folding a power-of-two factor of the constant into the RDSVL immediate and +// compensating with an extra shift. +// +// We rewrite: +//   (mul (srl (rdsvl 1), w), x) +// to one of: +//   (shl (rdsvl y),  z)   if z > 0 +//   (srl (rdsvl y), abs(z))   if z < 0 +// where integers y, z satisfy   x = y * 2^(w + z)   and   y ∈ [-32, 31]. 
+static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) { +  SDLoc DL(Mul); +  EVT VT = Mul->getValueType(0); +  SDValue MulOp0 = Mul->getOperand(0); +  int ConstMultiplier = +      cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue(); +  if ((MulOp0->getOpcode() != ISD::SRL) || +      (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL)) +    return SDValue(); + +  unsigned AbsConstValue = abs(ConstMultiplier); +  unsigned OperandShift = +      cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue(); + +  // z ≤ ctz(|x|) - w  (largest extra shift we can take while keeping y +  // integral) +  int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift; + +  // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need: +  // 2^(w + z) ≥ ceil(x / B)  ⇒  z ≥ ceil_log2(ceil(x / B)) - w  (LowerBound). +  unsigned B = ConstMultiplier < 0 ? 32 : 31; +  unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B) +  int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift; + +  // No valid solution found. +  if (LowerBound > UpperBound) +    return SDValue(); + +  // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra +  // shift if possible. +  int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound); + +  // y = x / 2^(w + z) +  int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) * +                     (ConstMultiplier < 0 ? -1 : 1); +  auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, +                           DAG.getSignedConstant(RdsvlMul, DL, MVT::i32)); + +  if (Shift == 0) +    return Rdsvl; +  return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl, +                     DAG.getConstant(abs(Shift), DL, MVT::i32), +                     SDNodeFlags::Exact); +} +  // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz  // Same for other types with equivalent constants.  
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { @@ -19604,6 +19661,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,    if (!isa<ConstantSDNode>(N1))      return SDValue(); +  if (SDValue Ext = performMulRdsvlCombine(N, DAG)) +    return Ext; +    ConstantSDNode *C = cast<ConstantSDNode>(N1);    const APInt &ConstValue = C->getAPIntValue(); @@ -26665,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N,    }    if (N->getOpcode() == AArch64ISD::DUP) { +    SDValue Op = N->getOperand(0); + +    // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer. +    // For example: +    //   v4i32 = DUP (i32 (zextloadi8 addr)) +    // => +    //   v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0 +    //   v4i32 = DUPLANE32 (v4i32), 0 +    if (auto *LD = dyn_cast<LoadSDNode>(Op)) { +      ISD::LoadExtType ExtType = LD->getExtensionType(); +      EVT MemVT = LD->getMemoryVT(); +      EVT ElemVT = VT.getVectorElementType(); +      if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) && +          (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) && +          ElemVT != MemVT && LD->hasOneUse()) { +        EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT, +                                        128 / ElemVT.getSizeInBits()); +        SDValue ScalarToVec = +            DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op); +        return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec, +                               DCI.DAG.getConstant(0, DL, MVT::i64)); +      } +    } +      // If the instruction is known to produce a scalar in SIMD registers, we can      // duplicate it across the vector lanes using DUPLANE instead of moving it      // to a GPR first. 
For example, this allows us to handle:      //   v4i32 = DUP (i32 (FCMGT (f32, f32))) -    SDValue Op = N->getOperand(0);      // FIXME: Ideally, we should be able to handle all instructions that      // produce a scalar value in FPRs.      if (Op.getOpcode() == AArch64ISD::FCMEQ || @@ -29430,15 +29513,6 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  RTLIB::LibcallImpl SecurityCheckCookieLibcall = -      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); -  if (SecurityCheckCookieLibcall != RTLIB::Unsupported) -    return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall)); -  return TargetLowering::getSSPStackGuardCheck(M); -} -  Value *  AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    // Android provides a fixed TLS slot for the SafeStack pointer. See the @@ -29447,11 +29521,6 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    if (Subtarget->isTargetAndroid())      return UseTlsOffset(IRB, 0x48); -  // Fuchsia is similar. -  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. 
-  if (Subtarget->isTargetFuchsia()) -    return UseTlsOffset(IRB, -0x8); -    return TargetLowering::getSafeStackPointerLocation(IRB);  } @@ -29769,7 +29838,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {    // Checks to allow the use of SME instructions    if (auto *Base = dyn_cast<CallBase>(&Inst)) { -    auto CallAttrs = SMECallAttrs(*Base, this); +    auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());      if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||          CallAttrs.requiresPreservingZT0() ||          CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9f..2cb8ed2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -366,7 +366,6 @@ public:    Value *getIRStackGuard(IRBuilderBase &IRB) const override;    void insertSSPDeclarations(Module &M) const override; -  Function *getSSPStackGuardCheck(const Module &M) const override;    /// If the target has a standard location for the unsafe stack pointer,    /// returns the address of that location. Otherwise, returns nullptr. 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 09ce713..58a53af 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1894,6 +1894,21 @@ def btihint_op : Operand<i32> {    }];  } +def CMHPriorityHintOperand : AsmOperandClass { +  let Name = "CMHPriorityHint"; +  let ParserMethod = "tryParseCMHPriorityHint"; +} + +def CMHPriorityHint_op : Operand<i32> { +  let ParserMatchClass = CMHPriorityHintOperand; +  let PrintMethod = "printCMHPriorityHintOp"; +  let MCOperandPredicate = [{ +    if (!MCOp.isImm()) +      return false; +    return AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(MCOp.getImm()) != nullptr; +  }]; +} +  class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),                         "mrs", "\t$Rt, $systemreg"> {    bits<16> systemreg; @@ -4636,6 +4651,48 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterOperand regtype,                                                    GPR64sp:$Rn, 0)>;  } +class BaseLoadStoreAcquirePairOffset<bits<4> opc, bit L, dag oops, dag iops, +                              string asm> +    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, #0]", "", []> { +  bits<5> Rt; +  bits<5> Rt2; +  bits<5> Rn; +  let Inst{31-23} = 0b110110010; +  let Inst{22}    = L; +  let Inst{21}    = 0b0; +  let Inst{20-16} = Rt2; +  let Inst{15-12} = opc; +  let Inst{11-10} = 0b10; +  let Inst{9-5}   = Rn; +  let Inst{4-0}   = Rt; +} + +multiclass LoadAcquirePairOffset<bits<4> opc, string asm> { +  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in +  def i : BaseLoadStoreAcquirePairOffset<opc, 0b1, +                                  (outs GPR64:$Rt, GPR64:$Rt2), +                                  (ins GPR64sp:$Rn), asm>, +          Sched<[WriteAtomic, WriteLDHi]>; + +  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", +                  (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2, +    
                                              GPR64sp:$Rn)>; +} + + +multiclass StoreAcquirePairOffset<bits<4> opc, string asm> { +  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in +  def i : BaseLoadStoreAcquirePairOffset<opc, 0b0, (outs), +                                  (ins GPR64:$Rt, GPR64:$Rt2, +                                       GPR64sp:$Rn), +                                  asm>, +          Sched<[WriteSTP]>; + +  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", +                  (!cast<Instruction>(NAME # "i") GPR64:$Rt, GPR64:$Rt2, +                                                  GPR64sp:$Rn)>; +} +  // (pre-indexed)  class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,                                string asm> @@ -5241,7 +5298,7 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,  }  multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm,  -                                 SDPatternOperator OpN = null_frag> { +                                 SDPatternOperator OpN> {    // double-precision to 32-bit SIMD/FPR    def SDr :  BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm,               [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> { @@ -6481,8 +6538,7 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,  }  multiclass SIMDThreeSameVectorMLA<bit Q, string asm, SDPatternOperator op> { - -  def v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b", +  def v16i8_v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b",                                           V128, v8f16, v16i8, op>;  } @@ -6491,6 +6547,23 @@ multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm, SDPatternOpera                                           V128, v4f32, v16i8, op>;  } +multiclass SIMDThreeSameVectorFMLA<string asm> { +  def v8f16_v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b11, 0b1101, asm, ".8h", ".8h", + 
                                         V128, v8f16, v8f16, null_frag>; +} + +multiclass SIMDThreeSameVectorFMLAWiden<string asm> { +  def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1101, asm, ".4s", ".8h", +                                          V128, v4f32, v8f16, null_frag>; +} + +multiclass SIMDThreeSameVectorFDot<string asm, SDPatternOperator OpNode = null_frag> { +  def v4f16_v2f32 : BaseSIMDThreeSameVectorDot<0, 0, 0b10, 0b1111, asm, ".2s", ".4h", V64, +                                         v2f32, v4f16, OpNode>; +  def v8f16_v4f32 : BaseSIMDThreeSameVectorDot<1, 0, 0b10, 0b1111, asm, ".4s", ".8h", V128, +                                         v4f32, v8f16, OpNode>; +} +  // FP8 assembly/disassembly classes  //---------------------------------------------------------------------------- @@ -9112,6 +9185,13 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,                                                V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>;  } +multiclass SIMDThreeSameVectorFDOTIndex<string asm> { +  def v4f16_v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b01, 0b1001, asm, ".2s", ".4h", ".2h", +                                           V64, v2f32, v4f16, VectorIndexS, null_frag>; +  def v8f16_v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b01, 0b1001, asm, ".4s", ".8h",".2h", +                                            V128, v4f32, v8f16, VectorIndexS, null_frag>; +} +  //----------------------------------------------------------------------------  // FP8 Advanced SIMD vector x indexed element  multiclass SIMD_FP8_Dot2_Index<string asm, SDPatternOperator op> { @@ -13227,3 +13307,34 @@ multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{        let Predicates = [HasNEON, HasF8F32MM];      }  } + +//---------------------------------------------------------------------------- +// Contention Management Hints - FEAT_CMH 
+//---------------------------------------------------------------------------- + +class SHUHInst<string asm> : I< +    (outs), +    (ins CMHPriorityHint_op:$priority), +    asm, "\t$priority", "", []>, Sched<[]> { +  bits<1> priority; +  let Inst{31-12} = 0b11010101000000110010; +  let Inst{11-8}  = 0b0110; +  let Inst{7-6}   = 0b01; +  let Inst{5}     = priority; +  let Inst{4-0}   = 0b11111; +} + +multiclass SHUH<string asm> { +  def NAME : SHUHInst<asm>; +  def      : InstAlias<asm, (!cast<Instruction>(NAME) 0), 1>; +} + +class STCPHInst<string asm> : I< +    (outs), +    (ins), +    asm, "", "", []>, Sched<[]> { +    let Inst{31-12} = 0b11010101000000110010; +    let Inst{11-8}  = 0b0110; +    let Inst{7-5}   = 0b100; +    let Inst{4-0}   = 0b11111; +} diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 92f260f..b9e299e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -50,63 +50,44 @@ def HasV9_4a         : Predicate<"Subtarget->hasV9_4aOps()">,                                   AssemblerPredicateWithAll<(all_of HasV9_4aOps), "armv9.4a">;  def HasV8_0r         : Predicate<"Subtarget->hasV8_0rOps()">,                                   AssemblerPredicateWithAll<(all_of HasV8_0rOps), "armv8-r">; -  def HasEL2VMSA       : Predicate<"Subtarget->hasEL2VMSA()">, -                       AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">; - +                                 AssemblerPredicateWithAll<(all_of FeatureEL2VMSA), "el2vmsa">;  def HasEL3           : Predicate<"Subtarget->hasEL3()">, -                       AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">; - +                                 AssemblerPredicateWithAll<(all_of FeatureEL3), "el3">;  def HasVH            : Predicate<"Subtarget->hasVH()">, -                       AssemblerPredicateWithAll<(all_of FeatureVH), "vh">; - +                                 
AssemblerPredicateWithAll<(all_of FeatureVH), "vh">;  def HasLOR           : Predicate<"Subtarget->hasLOR()">, -                       AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">; - +                                 AssemblerPredicateWithAll<(all_of FeatureLOR), "lor">;  def HasPAuth         : Predicate<"Subtarget->hasPAuth()">, -                       AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; - +                                 AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">;  def HasPAuthLR       : Predicate<"Subtarget->hasPAuthLR()">, -                       AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; - +                                 AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">;  def HasJS            : Predicate<"Subtarget->hasJS()">, -                       AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; - +                                 AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">;  def HasCCIDX         : Predicate<"Subtarget->hasCCIDX()">, -                       AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; - -def HasComplxNum      : Predicate<"Subtarget->hasComplxNum()">, -                       AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">; - +                                 AssemblerPredicateWithAll<(all_of FeatureCCIDX), "ccidx">; +def HasComplxNum     : Predicate<"Subtarget->hasComplxNum()">, +                                 AssemblerPredicateWithAll<(all_of FeatureComplxNum), "complxnum">;  def HasNV            : Predicate<"Subtarget->hasNV()">, -                       AssemblerPredicateWithAll<(all_of FeatureNV), "nv">; - +                                 AssemblerPredicateWithAll<(all_of FeatureNV), "nv">;  def HasMPAM          : Predicate<"Subtarget->hasMPAM()">, -                       AssemblerPredicateWithAll<(all_of FeatureMPAM), "mpam">; - +                                 AssemblerPredicateWithAll<(all_of 
FeatureMPAM), "mpam">;  def HasDIT           : Predicate<"Subtarget->hasDIT()">, -                       AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; - -def HasTRACEV8_4         : Predicate<"Subtarget->hasTRACEV8_4()">, -                       AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">; - +                                 AssemblerPredicateWithAll<(all_of FeatureDIT), "dit">; +def HasTRACEV8_4     : Predicate<"Subtarget->hasTRACEV8_4()">, +                                 AssemblerPredicateWithAll<(all_of FeatureTRACEV8_4), "tracev8.4">;  def HasAM            : Predicate<"Subtarget->hasAM()">, -                       AssemblerPredicateWithAll<(all_of FeatureAM), "am">; - +                                 AssemblerPredicateWithAll<(all_of FeatureAM), "am">;  def HasSEL2          : Predicate<"Subtarget->hasSEL2()">, -                       AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; - -def HasTLB_RMI          : Predicate<"Subtarget->hasTLB_RMI()">, -                       AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">; - +                                 AssemblerPredicateWithAll<(all_of FeatureSEL2), "sel2">; +def HasTLB_RMI       : Predicate<"Subtarget->hasTLB_RMI()">, +                                 AssemblerPredicateWithAll<(all_of FeatureTLB_RMI), "tlb-rmi">;  def HasFlagM         : Predicate<"Subtarget->hasFlagM()">, -                       AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; - -def HasRCPC_IMMO      : Predicate<"Subtarget->hasRCPC_IMMO()">, -                       AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">; - +                                 AssemblerPredicateWithAll<(all_of FeatureFlagM), "flagm">; +def HasRCPC_IMMO     : Predicate<"Subtarget->hasRCPC_IMMO()">, +                                 AssemblerPredicateWithAll<(all_of FeatureRCPC_IMMO), "rcpc-immo">;  def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">, -                              
 AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">; +                                 AssemblerPredicateWithAll<(all_of FeatureFPARMv8), "fp-armv8">;  def HasNEON          : Predicate<"Subtarget->isNeonAvailable()">,                                   AssemblerPredicateWithAll<(all_of FeatureNEON), "neon">;  def HasSM4           : Predicate<"Subtarget->hasSM4()">, @@ -149,13 +130,13 @@ def HasSVE2          : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS                                   AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;  def HasSVE2p1        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">; -def HasSVEAES       : Predicate<"Subtarget->hasSVEAES()">, +def HasSVEAES        : Predicate<"Subtarget->hasSVEAES()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">; -def HasSVESM4       : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">, +def HasSVESM4        : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVESM4()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVESM4), "sve-sm4">; -def HasSVESHA3      : Predicate<"Subtarget->hasSVESHA3()">, +def HasSVESHA3       : Predicate<"Subtarget->hasSVESHA3()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVESHA3), "sve-sha3">; -def HasSVEBitPerm   : Predicate<"Subtarget->hasSVEBitPerm()">, +def HasSVEBitPerm    : Predicate<"Subtarget->hasSVEBitPerm()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">;  def HasSMEandIsNonStreamingSafe                       : Predicate<"Subtarget->hasSME()">, @@ -196,7 +177,7 @@ def HasSSVE_FP8DOT2  : Predicate<"Subtarget->hasSSVE_FP8DOT2() || "                                   "(Subtarget->hasSVE2() && Subtarget->hasFP8DOT2())">,                        
           AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT2,                                                             (all_of FeatureSVE2, FeatureFP8DOT2)), -                                "ssve-fp8dot2 or (sve2 and fp8dot2)">; +                                 "ssve-fp8dot2 or (sve2 and fp8dot2)">;  def HasFP8DOT4       : Predicate<"Subtarget->hasFP8DOT4()">,                                   AssemblerPredicateWithAll<(all_of FeatureFP8DOT4), "fp8dot4">;  def HasSSVE_FP8DOT4  : Predicate<"Subtarget->hasSSVE_FP8DOT4() || " @@ -204,43 +185,60 @@ def HasSSVE_FP8DOT4  : Predicate<"Subtarget->hasSSVE_FP8DOT4() || "                                   AssemblerPredicateWithAll<(any_of FeatureSSVE_FP8DOT4,                                                             (all_of FeatureSVE2, FeatureFP8DOT4)),                                   "ssve-fp8dot4 or (sve2 and fp8dot4)">; -def HasLUT          : Predicate<"Subtarget->hasLUT()">, +def HasLUT           : Predicate<"Subtarget->hasLUT()">,                                   AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">; -def HasSME_LUTv2    : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, +def HasSME_LUTv2     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,                                   AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">; -def HasSMEF8F16     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">, +def HasSMEF8F16      : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">,                                   AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">; -def HasSMEF8F32     : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, +def HasSMEF8F32      : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">,                                   AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; -def HasSME_MOP4     : Predicate<"(Subtarget->isStreaming() 
&& Subtarget->hasSME_MOP4())">, +def HasSME_MOP4      : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">,                                   AssemblerPredicateWithAll<(all_of FeatureSME_MOP4), "sme-mop4">; -def HasSME_TMOP     : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">, +def HasSME_TMOP      : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">,                                   AssemblerPredicateWithAll<(all_of FeatureSME_TMOP), "sme-tmop">; - -def HasCMPBR        : Predicate<"Subtarget->hasCMPBR()">, +def HasCMPBR         : Predicate<"Subtarget->hasCMPBR()">,                                   AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">; -def HasF8F32MM      : Predicate<"Subtarget->hasF8F32MM()">, +def HasF8F32MM       : Predicate<"Subtarget->hasF8F32MM()">,                                   AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">; -def HasF8F16MM      : Predicate<"Subtarget->hasF8F16MM()">, +def HasF8F16MM       : Predicate<"Subtarget->hasF8F16MM()">,                                   AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">; -def HasFPRCVT       : Predicate<"Subtarget->hasFPRCVT()">, +def HasFPRCVT        : Predicate<"Subtarget->hasFPRCVT()">,                                   AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">; -def HasLSFE         : Predicate<"Subtarget->hasLSFE()">, +def HasLSFE          : Predicate<"Subtarget->hasLSFE()">,                                   AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">; -def HasSME2p2       : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, +def HasSME2p2        : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">,                                   AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">; -def HasSVEAES2      : Predicate<"Subtarget->hasSVEAES2()">, +def HasSVEAES2       : Predicate<"Subtarget->hasSVEAES2()">,                    
               AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">; -def HasSVEBFSCALE   : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, +def HasSVEBFSCALE    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">; -def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, +def HasSVE_F16F32MM  : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">,                                   AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">;  def HasPCDPHINT      : Predicate<"Subtarget->hasPCDPHINT()">, -                       AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">; +                                 AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">;  def HasLSUI          : Predicate<"Subtarget->hasLSUI()">, -                       AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">; +                                 AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">;  def HasOCCMO         : Predicate<"Subtarget->hasOCCMO()">, -                       AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">; +                                 AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">; +def HasCMH           : Predicate<"Subtarget->hasCMH()">, +                                 AssemblerPredicateWithAll<(all_of FeatureCMH), "cmh">; +def HasLSCP          : Predicate<"Subtarget->hasLSCP()">, +                                 AssemblerPredicateWithAll<(all_of FeatureLSCP), "lscp">; +def HasSVE2p2        : Predicate<"Subtarget->hasSVE2p2()">, +                                 AssemblerPredicateWithAll<(all_of FeatureSVE2p2), "sve2p2">; +def HasSVE_B16MM     : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_B16MM()">, +                             
    AssemblerPredicateWithAll<(all_of FeatureSVE_B16MM), "sve-b16mm">; +def HasF16MM         : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasF16MM()">, +                                 AssemblerPredicateWithAll<(all_of FeatureF16MM), "f16mm">; +def HasSVE2p3        : Predicate<"Subtarget->hasSVE2p3()">, +                                 AssemblerPredicateWithAll<(all_of FeatureSVE2p3), "sve2p3">; +def HasSME2p3        : Predicate<"Subtarget->hasSME2p3()">, +                                 AssemblerPredicateWithAll<(all_of FeatureSME2p3), "sme2p3">; +def HasF16F32DOT     : Predicate<"Subtarget->hasF16F32DOT()">, +                                 AssemblerPredicateWithAll<(all_of FeatureF16F32DOT), "f16f32dot">; +def HasF16F32MM      : Predicate<"Subtarget->hasF16F32MM()">, +                                 AssemblerPredicateWithAll<(all_of FeatureF16F32MM), "f16f32mm">;  // A subset of SVE(2) instructions are legal in Streaming SVE execution mode,  // they should be enabled if either has been specified. @@ -310,6 +308,10 @@ def HasSVE2p2_or_SME2p2      : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">,                  AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2),                  "sme2p2 or sve2p2">; +def HasSVE2p3_or_SME2p3 +    : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p3() || Subtarget->hasSME2p3())">, +                AssemblerPredicateWithAll<(any_of FeatureSME2p3, FeatureSVE2p3), +                "sme2p3 or sve2p3">;  def HasNonStreamingSVE2p2_or_SME2p2      : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||"                  "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, @@ -328,100 +330,110 @@ def HasNEONandIsStreamingSafe        AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;  // A subset of NEON instructions are legal in Streaming SVE mode only with +sme2p2.  
def HasNEONandIsSME2p2StreamingSafe -    : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">, -    AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; +                     : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">, +                                 AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;  def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,                                   AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;  def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">, -                       AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">; +                                 AssemblerPredicateWithAll<(all_of FeatureAltFPCmp), "altnzcv">;  def HasFRInt3264     : Predicate<"Subtarget->hasFRInt3264()">, -                       AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">; +                                 AssemblerPredicateWithAll<(all_of FeatureFRInt3264), "frint3264">;  def HasSB            : Predicate<"Subtarget->hasSB()">, -                       AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; -def HasPredRes      : Predicate<"Subtarget->hasPredRes()">, -                       AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">; +                                 AssemblerPredicateWithAll<(all_of FeatureSB), "sb">; +def HasPredRes       : Predicate<"Subtarget->hasPredRes()">, +                                 AssemblerPredicateWithAll<(all_of FeaturePredRes), "predres">;  def HasCCDP          : Predicate<"Subtarget->hasCCDP()">, -                       AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">; +                                 AssemblerPredicateWithAll<(all_of FeatureCacheDeepPersist), "ccdp">;  def HasBTI           : Predicate<"Subtarget->hasBTI()">, -                       AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), 
"bti">; +                                 AssemblerPredicateWithAll<(all_of FeatureBranchTargetId), "bti">;  def HasMTE           : Predicate<"Subtarget->hasMTE()">, -                       AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">; +                                 AssemblerPredicateWithAll<(all_of FeatureMTE), "mte">;  def HasTME           : Predicate<"Subtarget->hasTME()">, -                       AssemblerPredicateWithAll<(all_of FeatureTME), "tme">; +                                 AssemblerPredicateWithAll<(all_of FeatureTME), "tme">;  def HasETE           : Predicate<"Subtarget->hasETE()">, -                       AssemblerPredicateWithAll<(all_of FeatureETE), "ete">; +                                 AssemblerPredicateWithAll<(all_of FeatureETE), "ete">;  def HasTRBE          : Predicate<"Subtarget->hasTRBE()">, -                       AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">; +                                 AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;  def HasBF16          : Predicate<"Subtarget->hasBF16()">, -                       AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">; +                                 AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;  def HasNoBF16        : Predicate<"!Subtarget->hasBF16()">;  def HasMatMulInt8    : Predicate<"Subtarget->hasMatMulInt8()">, -                       AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">; +                                 AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;  def HasMatMulFP32    : Predicate<"Subtarget->hasMatMulFP32()">, -                       AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; +                                 AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;  def HasMatMulFP64    : Predicate<"Subtarget->hasMatMulFP64()">, -                       AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; +                                 
AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;  def HasXS            : Predicate<"Subtarget->hasXS()">, -                       AssemblerPredicateWithAll<(all_of FeatureXS), "xs">; +                                 AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;  def HasWFxT          : Predicate<"Subtarget->hasWFxT()">, -                       AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">; +                                 AssemblerPredicateWithAll<(all_of FeatureWFxT), "wfxt">;  def HasLS64          : Predicate<"Subtarget->hasLS64()">, -                       AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">; +                                 AssemblerPredicateWithAll<(all_of FeatureLS64), "ls64">;  def HasBRBE          : Predicate<"Subtarget->hasBRBE()">, -                       AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">; +                                 AssemblerPredicateWithAll<(all_of FeatureBRBE), "brbe">;  def HasSPE_EEF       : Predicate<"Subtarget->hasSPE_EEF()">, -                       AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">; +                                 AssemblerPredicateWithAll<(all_of FeatureSPE_EEF), "spe-eef">;  def HasHBC           : Predicate<"Subtarget->hasHBC()">, -                       AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">; +                                 AssemblerPredicateWithAll<(all_of FeatureHBC), "hbc">;  def HasMOPS          : Predicate<"Subtarget->hasMOPS()">, -                       AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">; +                                 AssemblerPredicateWithAll<(all_of FeatureMOPS), "mops">;  def HasCLRBHB        : Predicate<"Subtarget->hasCLRBHB()">, -                       AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">; +                                 AssemblerPredicateWithAll<(all_of FeatureCLRBHB), "clrbhb">;  def HasSPECRES2      : Predicate<"Subtarget->hasSPECRES2()">, -              
         AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">; +                                 AssemblerPredicateWithAll<(all_of FeatureSPECRES2), "specres2">;  def HasITE           : Predicate<"Subtarget->hasITE()">, -                       AssemblerPredicateWithAll<(all_of FeatureITE), "ite">; +                                 AssemblerPredicateWithAll<(all_of FeatureITE), "ite">;  def HasTHE           : Predicate<"Subtarget->hasTHE()">, -                       AssemblerPredicateWithAll<(all_of FeatureTHE), "the">; +                                 AssemblerPredicateWithAll<(all_of FeatureTHE), "the">;  def HasRCPC3         : Predicate<"Subtarget->hasRCPC3()">, -                       AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">; +                                 AssemblerPredicateWithAll<(all_of FeatureRCPC3), "rcpc3">;  def HasLSE128        : Predicate<"Subtarget->hasLSE128()">, -                       AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">; +                                 AssemblerPredicateWithAll<(all_of FeatureLSE128), "lse128">;  def HasD128          : Predicate<"Subtarget->hasD128()">, -                       AssemblerPredicateWithAll<(all_of FeatureD128), "d128">; +                                 AssemblerPredicateWithAll<(all_of FeatureD128), "d128">;  def HasCHK           : Predicate<"Subtarget->hasCHK()">, -                       AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">; +                                 AssemblerPredicateWithAll<(all_of FeatureCHK), "chk">;  def HasGCS           : Predicate<"Subtarget->hasGCS()">, -                       AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">; +                                 AssemblerPredicateWithAll<(all_of FeatureGCS), "gcs">;  def HasCPA           : Predicate<"Subtarget->hasCPA()">, -                       AssemblerPredicateWithAll<(all_of FeatureCPA), "cpa">; +                                 AssemblerPredicateWithAll<(all_of 
FeatureCPA), "cpa">; +def HasTLBID         : Predicate<"Subtarget->hasTLBID()">, +                                 AssemblerPredicateWithAll<(all_of FeatureTLBID), "tlbid">; +def HasMPAMv2        : Predicate<"Subtarget->hasMPAMv2()">, +                                 AssemblerPredicateWithAll<(all_of FeatureMPAMv2), "mpamv2">; +def HasMTETC         : Predicate<"Subtarget->hasMTETC()">, +                                 AssemblerPredicateWithAll<(all_of FeatureMTETC), "mtetc">; +def HasGCIE          : Predicate<"Subtarget->hasGCIE()">, +                                 AssemblerPredicateWithAll<(all_of FeatureGCIE), "gcie">;  def IsLE             : Predicate<"Subtarget->isLittleEndian()">;  def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;  def IsWindows        : Predicate<"Subtarget->isTargetWindows()">;  def UseExperimentalZeroingPseudos -    : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; +                     : Predicate<"Subtarget->useExperimentalZeroingPseudos()">;  def UseAlternateSExtLoadCVTF32 -    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; +                     : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;  def UseNegativeImmediates -    : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), -                                             "NegativeImmediates">; +                     : Predicate<"false">, +                                 AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), +                                 "NegativeImmediates">; -def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; +def UseScalarIncVL   : Predicate<"Subtarget->useScalarIncVL()">;  def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; -def HasFastIncVL : Predicate<"!Subtarget->hasDisableFastIncVL()">; +def HasFastIncVL     : Predicate<"!Subtarget->hasDisableFastIncVL()">; -def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseSVEFPLD1R    
 : Predicate<"!Subtarget->noSVEFPLD1R()">; -def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; +def UseLDAPUR        : Predicate<"!Subtarget->avoidLDAPUR()">;  def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",                                    SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,                                                         SDTCisInt<1>]>>; -def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def AllowMisalignedMemAccesses +                    : Predicate<"!Subtarget->requiresStrictAlign()">;  def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; @@ -3692,6 +3704,12 @@ def UDF : UDFType<0, "udf">;  // Load instructions.  //===----------------------------------------------------------------------===// +let Predicates = [HasLSCP] in { +defm LDAP  : LoadAcquirePairOffset<0b0101, "ldap">; +defm LDAPP : LoadAcquirePairOffset<0b0111, "ldapp">; +defm STLP  : StoreAcquirePairOffset<0b0101, "stlp">; +} +  // Pair (indexed, offset)  defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;  defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">; @@ -4004,22 +4022,6 @@ defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",  def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),        (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; -// load zero-extended i32, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -// load zero-extended i16, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRBui 
GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -// load zero-extended i16, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), -          (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), -          (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -  // Pre-fetch.  def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",                          [(AArch64Prefetch timm:$Rt, @@ -4371,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64                 (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),             (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>; +// Patterns for bitconvert or scalar_to_vector of load operations. +// Enables direct SIMD register loads for small integer types (i8/i16) that are +// naturally zero-extended to i32/i64. +multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy, +                                SDPatternOperator OuterOp, +                                PatFrags LoadOp8, PatFrags LoadOp16> { +  // 8-bit loads. +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>; + +  // 16-bit loads. 
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>; +} + +// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit. +multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy, +                                   SDPatternOperator OuterOp, +                                   PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> { +  defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>; + +  // 32-bit loads. 
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>; +} + +// Instantiate bitconvert patterns for floating-point types. +defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>; +defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>; + +// Instantiate scalar_to_vector patterns for all vector types. +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>; +  // Pre-fetch.  
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",                    [(AArch64Prefetch timm:$Rt, @@ -5235,113 +5295,10 @@ let Predicates = [HasNEON, HasFPRCVT] in{    defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>;    defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>;    defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>; -  defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">; -  defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">; -} - - -// AArch64's FCVT instructions saturate when out of range. -multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat f16:$Rn, i32)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat f16:$Rn, i64)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat f32:$Rn, i32)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat f32:$Rn, i64)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat f64:$Rn, i32)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat f64:$Rn, i64)), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), -            
(!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; -  } -  def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; -  def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), -            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), -            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; -  } -  def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), -            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), -            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; -  def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), -            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), -            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; -} - -defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; -defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; - -multiclass 
FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> { -  def : Pat<(i32 (to_int (round f32:$Rn))), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int (round f32:$Rn))), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int (round f64:$Rn))), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int (round f64:$Rn))), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  // These instructions saturate like fp_to_[su]int_sat. -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +  defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs", any_fp_to_sint>; +  defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>;  } -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil,  "FCVTPS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil,  "FCVTPU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">; -  let 
Predicates = [HasFullFP16] in { @@ -6549,8 +6506,8 @@ defm FCVTNU : SIMDFPTwoScalar<   1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtn  defm FCVTPS : SIMDFPTwoScalar<   0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>;  defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>;  def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">; +defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;  defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe">;  defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx">;  defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte">; @@ -6570,6 +6527,7 @@ defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",  // Floating-point conversion patterns.  multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> { +  let Predicates = [HasFPRCVT] in {    def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))),              (!cast<Instruction>(INST # SDr) FPR64:$Rn)>;    def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))), @@ -6578,6 +6536,7 @@ multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {              (!cast<Instruction>(INST # DHr) FPR16:$Rn)>;    def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),              (!cast<Instruction>(INST # DSr) FPR32:$Rn)>; +  }    def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),              (!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;    def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))), @@ -6592,6 +6551,8 @@ defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">;  defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">;  defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">;  defm: 
FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_sint, "FCVTZS">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_uint, "FCVTZU">;  multiclass FPToIntegerIntPats<Intrinsic round, string INST> {    let Predicates = [HasFullFP16] in { @@ -6648,6 +6609,196 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {  defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;  defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; +// AArch64's FCVT instructions saturate when out of range. +multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat f16:$Rn, i32)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat f16:$Rn, i64)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat f32:$Rn, i32)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat f32:$Rn, i64)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat f64:$Rn, i32)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat f64:$Rn, i64)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can 
use register classes to determine +  // which FCVT instruction to use. +  let Predicates = [HasFPRCVT] in { +  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # SHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # DHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFPRCVT] in { +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f16:$Rn, i32)))), +            (!cast<Instruction>(INST # SHr) f16:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f16:$Rn, i64)))), +            (!cast<Instruction>(INST # DHr) f16:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f32:$Rn, i64)))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f64:$Rn, i32)))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f64:$Rn, i64)))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; +  } +  def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul 
f32:$Rn, fixedpoint_f32_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; +  def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), +            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), +            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; +  } +  def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), +            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), +            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; +  def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), +            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), +            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; +} + +defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; +defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; + +multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode to_int_sat_gi, SDNode round, string INST> { +  def : Pat<(i32 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can use register 
classes to determine +  // which FCVT instruction to use. +  let Predicates = [HasFPRCVT] in { +  def : Pat<(i64 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(i32 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFPRCVT] in { +  def : Pat<(f64 (bitconvert (i64 (to_int (round f32:$Rn))))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(f32 (bitconvert (i32 (to_int (round f64:$Rn))))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int (round f32:$Rn))))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int (round f64:$Rn))))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  // These instructions saturate like fp_to_[su]int_sat. +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can use register classes to determine +  // which FCVT instruction to use. 
+  let Predicates = [HasFPRCVT] in { +    def : Pat<(i32 (to_int_sat_gi (round f16:$Rn))), +              (!cast<Instruction>(INST # SHr) f16:$Rn)>; +    def : Pat<(i64 (to_int_sat_gi (round f16:$Rn))), +              (!cast<Instruction>(INST # DHr) f16:$Rn)>; +    def : Pat<(i64 (to_int_sat_gi (round f32:$Rn))), +              (!cast<Instruction>(INST # DSr) f32:$Rn)>; +    def : Pat<(i32 (to_int_sat_gi (round f64:$Rn))), +              (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi (round f32:$Rn))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi (round f64:$Rn))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; +             +  let Predicates = [HasFPRCVT] in { +    def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f16:$Rn), i32)))), +              (!cast<Instruction>(INST # SHr) f16:$Rn)>; +    def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f16:$Rn), i64)))), +              (!cast<Instruction>(INST # DHr) f16:$Rn)>; +    def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f32:$Rn), i64)))), +              (!cast<Instruction>(INST # DSr) f32:$Rn)>; +    def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f64:$Rn), i32)))), +              (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f32:$Rn), i32)))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f64:$Rn), i64)))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fceil,  "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fceil,  "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, 
ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fround, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fround, "FCVTAU">; +  // f16 -> s16 conversions  let Predicates = [HasFullFP16] in {    def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>; @@ -11244,8 +11395,28 @@ let Predicates = [HasLSFE] in {    def STBFMINNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">;  } +let Predicates = [HasF16F32DOT] in { +  defm FDOT :SIMDThreeSameVectorFDot<"fdot">; +  defm FDOTlane: SIMDThreeSameVectorFDOTIndex<"fdot">; +} + +let Predicates = [HasF16MM] in +  defm FMMLA : SIMDThreeSameVectorFMLA<"fmmla">; + +let Predicates = [HasF16F32MM] in +  defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">; +  let Uses = [FPMR, FPCR] in -defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; +  defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + +//===----------------------------------------------------------------------===// +// Contention Management Hints (FEAT_CMH) +//===----------------------------------------------------------------------===// + +let Predicates = [HasCMH] in { +  defm SHUH  : SHUH<"shuh">;       // Shared Update Hint instruction +  def STCPH  : STCPHInst<"stcph">; // Store Concurrent Priority Hint instruction +}  include "AArch64InstrAtomics.td"  include "AArch64SVEInstrInfo.td" diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 47144c7..cd94a25 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1341,6 +1341,10 @@ def Z_q  : RegisterOperand<ZPR,  "printTypedVectorList<0,'q'>"> {    let ParserMatchClass = ZPRVectorList<128, 1>;  } +def ZZ_Any  : RegisterOperand<ZPR2, "printTypedVectorList<0,0>"> { +  let ParserMatchClass = ZPRVectorList<0, 2>; +} + 
 def ZZ_b  : RegisterOperand<ZPR2, "printTypedVectorList<0,'b'>"> {    let ParserMatchClass = ZPRVectorList<8, 2>;  } @@ -1361,6 +1365,10 @@ def ZZ_q  : RegisterOperand<ZPR2, "printTypedVectorList<0,'q'>"> {    let ParserMatchClass = ZPRVectorList<128, 2>;  } +def ZZZ_Any  : RegisterOperand<ZPR3, "printTypedVectorList<0,0>"> { +  let ParserMatchClass = ZPRVectorList<0, 3>; +} +  def ZZZ_b  : RegisterOperand<ZPR3, "printTypedVectorList<0,'b'>"> {    let ParserMatchClass = ZPRVectorList<8, 3>;  } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index e552afe..752b185 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -1173,3 +1173,14 @@ let Predicates = [HasSME_MOP4, HasSMEF64F64] in {    defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">;    defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">;  } + +//===----------------------------------------------------------------------===// +// SME2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSME2p3] in { +  def LUTI6_ZTZ       : sme2_lut_single<"luti6">; +  def LUTI6_4ZT3Z     : sme2_luti6_zt_consecutive<"luti6">; +  def LUTI6_S_4ZT3Z   : sme2_luti6_zt_strided<"luti6">; +  def LUTI6_4Z2Z2ZI   : sme2_luti6_vector_vg4_consecutive<"luti6">; +  def LUTI6_S_4Z2Z2ZI : sme2_luti6_vector_vg4_strided<"luti6">; +} // [HasSME2p3] diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e..3b268dc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2569,7 +2569,7 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {  } // End HasBF16, HasSVE_or_SME  let Predicates = [HasBF16, HasSVE] in { -  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, 
ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; +  defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b011, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>;  } // End HasBF16, HasSVE  let Predicates = [HasBF16, HasSVE_or_SME] in { @@ -3680,15 +3680,15 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in {  } // End HasSVE_or_SME, HasMatMulInt8  let Predicates = [HasSVE, HasMatMulFP32] in { -  defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b10, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>; +  defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0b101, "fmmla", ZPR32, ZPR32, int_aarch64_sve_fmmla, nxv4f32, nxv4f32>;  } // End HasSVE, HasMatMulFP32  let Predicates = [HasSVE_F16F32MM] in { -  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b00, "fmmla", ZPR32, ZPR16>; +  def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;  } // End HasSVE_F16F32MM  let Predicates = [HasSVE, HasMatMulFP64] in { -  defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b11, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>; +  defm FMMLA_ZZZ_D : sve_fp_matrix_mla<0b111, "fmmla", ZPR64, ZPR64, int_aarch64_sve_fmmla, nxv2f64, nxv2f64>;    defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8,  nxv16i8, nxv16i1, AArch64ld1ro_z>;    defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1,  AArch64ld1ro_z>;    defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1,  AArch64ld1ro_z>; @@ -4272,9 +4272,9 @@ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$  defm SQCVTN_Z2Z_StoH  : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>;  defm UQCVTN_Z2Z_StoH  : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>;  defm SQCVTUN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtun", 0b10, int_aarch64_sve_sqcvtun_x2>; -defm SQRSHRN_Z2ZI_StoH  : sve2p1_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>; -defm UQRSHRN_Z2ZI_StoH  : 
sve2p1_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>; -defm SQRSHRUN_Z2ZI_StoH : sve2p1_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>; +defm SQRSHRN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"sqrshrn", 0b101, int_aarch64_sve_sqrshrn_x2>; +defm UQRSHRN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"uqrshrn", 0b111, int_aarch64_sve_uqrshrn_x2>; +defm SQRSHRUN_Z2ZI_StoH : sve_multi_vec_shift_narrow<"sqrshrun", 0b001, int_aarch64_sve_sqrshrun_x2>;  defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>;  defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>; @@ -4615,6 +4615,75 @@ let Predicates = [HasSVE2p2_or_SME2p2] in {    defm REVD_ZPzZ : sve_int_perm_rev_revd_z<"revd", AArch64revd_mt>;  } // End HasSME2p2orSVE2p2 + +//===----------------------------------------------------------------------===// +// SME2.3 or SVE2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p3_or_SME2p3] in { +  // SVE2 Add pairwise within quadword vector segments (unpredicated) +  defm ADDQP_ZZZ     : sve2_int_mul<0b110, "addqp",   null_frag>; + +  // SVE2 Add subtract/subtract pairwise +  defm ADDSUBP_ZZZ   : sve2_int_mul<0b111, "addsubp", null_frag>; +  defm SUBP_ZPmZZ    : sve2_int_arith_pred<0b100001, "subp", null_frag>; + +  // SVE2 integer absolute difference and accumulate long +  defm SABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b0, "sabal">; +  defm UABAL_ZZZ : sve2_int_two_way_absdiff_accum_long<0b1, "uabal">; + +  // SVE2 integer dot product +  def SDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b0, "sdot", ZPR16, ZPR8>; +  def UDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b1, "udot", ZPR16, ZPR8>; + +  // SVE2 integer indexed dot product +  def SDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b0, "sdot">; +  def UDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b1, "udot">; + +  // SVE2 fp convert, narrow and interleave to integer, rounding toward 
zero +  defm FCVTZSN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzsn", 0b0>; +  defm FCVTZUN_Z2Z : sve2_fp_to_int_downcvt<"fcvtzun", 0b1>; + +  // SVE2 signed/unsigned integer convert to floating-point +  defm SCVTF_ZZ   : sve2_int_to_fp_upcvt<"scvtf",   0b00>; +  defm SCVTFLT_ZZ : sve2_int_to_fp_upcvt<"scvtflt", 0b10>; +  defm UCVTF_ZZ   : sve2_int_to_fp_upcvt<"ucvtf",   0b01>; +  defm UCVTFLT_ZZ : sve2_int_to_fp_upcvt<"ucvtflt", 0b11>; + +  // SVE2 saturating shift right narrow by immediate and interleave +  defm SQRSHRN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"sqrshrn",  0b101>; +  defm SQRSHRUN_Z2ZI_HtoB : sve_multi_vec_round_shift_narrow<"sqrshrun", 0b001>; +  defm SQSHRN_Z2ZI_HtoB   : sve_multi_vec_round_shift_narrow<"sqshrn",   0b000>; +  defm SQSHRUN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"sqshrun",  0b100>; +  defm UQRSHRN_Z2ZI_HtoB  : sve_multi_vec_round_shift_narrow<"uqrshrn",  0b111>; +  defm UQSHRN_Z2ZI_HtoB   : sve_multi_vec_round_shift_narrow<"uqshrn",   0b010>; +  defm SQSHRUN_Z2ZI_StoH  : sve_multi_vec_shift_narrow<"sqshrun",  0b100, null_frag>; +  defm SQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"sqshrn",   0b000, null_frag>; +  defm UQSHRN_Z2ZI_StoH   : sve_multi_vec_shift_narrow<"uqshrn",   0b010, null_frag>; + +  defm LUTI6_Z2ZZI : sve2_luti6_vector_index<"luti6">; +} // End HasSME2p3orSVE2p3 + +//===----------------------------------------------------------------------===// +// SVE2.3 instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p3] in { +  def LUTI6_Z2ZZ : sve2_luti6_vector<"luti6">; +} + +//===----------------------------------------------------------------------===// +// SVE_B16MM Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE_B16MM] in { +  def BFMMLA_ZZZ_H : sve_fp_matrix_mla<0b110, "bfmmla", ZPR16, ZPR16>; +} + 
+//===----------------------------------------------------------------------===// +// F16MM Instructions +//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2p2, HasF16MM] in { +  def FMMLA_ZZZ_H : sve_fp_matrix_mla<0b100, "fmmla", ZPR16, ZPR16>; +} +  //===----------------------------------------------------------------------===//  // SME2.2 or SVE2.2 instructions - Legal in streaming mode iff target has SME2p2  //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index bdde8e3..2387f17 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2762,11 +2762,11 @@ def : InstRW<[V2Write_11c_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>;  def : InstRW<[V2Write_11c_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>;  // Non temporal store, scalar + imm -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;  // Non temporal store, scalar + scalar -def : InstRW<[V2Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>; -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; +def : InstRW<[V2Write_2c_1L01_1S_1V01], (instrs STNT1H_ZRR)>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BWD]_ZRR$")>;  // Scatter non temporal store, vector + scalar 32-bit element size  def : InstRW<[V2Write_4c_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>; diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 9438917..ae46d71 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -205,6 +205,7 @@ def lookupDCByName : SearchIndex {    let Key = ["Name"];  } +//                Op1    CRn     CRm     Op2  def : DC<"ZVA",  
 0b011, 0b0111, 0b0100, 0b001>;  def : DC<"IVAC",  0b000, 0b0111, 0b0110, 0b001>;  def : DC<"ISW",   0b000, 0b0111, 0b0110, 0b010>; @@ -241,6 +242,11 @@ def : DC<"CIGDVAC", 0b011, 0b0111, 0b1110, 0b101>;  def : DC<"GZVA",    0b011, 0b0111, 0b0100, 0b100>;  } +let Requires = [{ {AArch64::FeatureMTETC} }] in { +def : DC<"ZGBVA",   0b011, 0b0111, 0b0100, 0b101>; +def : DC<"GBVA",    0b011, 0b0111, 0b0100, 0b111>; +} +  let Requires = [{ {AArch64::FeatureMEC} }] in {  def : DC<"CIPAE",   0b100, 0b0111, 0b1110, 0b000>;  def : DC<"CIGDPAE", 0b100, 0b0111, 0b1110, 0b111>; @@ -813,11 +819,26 @@ def : BTI<"j",  0b100>;  def : BTI<"jc", 0b110>;  //===----------------------------------------------------------------------===// +// CMHPriority instruction options. +//===----------------------------------------------------------------------===// + +class CMHPriorityHint<string name, bits<1> encoding> : SearchableTable { +  let SearchableFields = ["Name", "Encoding"]; +  let EnumValueField = "Encoding"; + +  string Name = name; +  bits<1> Encoding; +  let Encoding = encoding; +} + +def : CMHPriorityHint<"ph", 0b1>; + +//===----------------------------------------------------------------------===//  // TLBI (translation lookaside buffer invalidate) instruction options.  
//===----------------------------------------------------------------------===//  class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm, -                 bits<3> op2, bit needsreg> { +                 bits<3> op2, bit needsreg, bit optionalreg> {    string Name = name;    bits<14> Encoding;    let Encoding{13-11} = op1; @@ -825,24 +846,25 @@ class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm,    let Encoding{6-3} = crm;    let Encoding{2-0} = op2;    bit NeedsReg = needsreg; +  bit OptionalReg = optionalreg;    list<string> Requires = [];    list<string> ExtraRequires = [];    code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }];  }  class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, -                bits<3> op2, bit needsreg> -  : TLBICommon<name, op1, crn, crm, op2, needsreg>; +                bits<3> op2, bit needsreg, bit optionalreg> +  : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>;  class TLBIPEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, -                 bits<3> op2, bit needsreg> -  : TLBICommon<name, op1, crn, crm, op2, needsreg>; +                 bits<3> op2, bit needsreg, bit optionalreg> +  : TLBICommon<name, op1, crn, crm, op2, needsreg, optionalreg>;  multiclass TLBITableBase {    def NAME # Table : GenericTable {      let FilterClass = NAME # "Entry";      let CppTypeName = NAME; -    let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; +    let Fields = ["Name", "Encoding", "NeedsReg", "OptionalReg", "RequiresStr"];      let PrimaryKey = ["Encoding"];      let PrimaryKeyName = "lookup" # NAME # "ByEncoding";    } @@ -856,60 +878,60 @@ defm TLBI  : TLBITableBase;  defm TLBIP : TLBITableBase;  multiclass TLBI<string name, bit hasTLBIP, bits<3> op1, bits<4> crn, bits<4> crm, -             bits<3> op2, bit needsreg = 1> { -  def : TLBIEntry<name, op1, crn, crm, op2, needsreg>; -  def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, 
op2, needsreg> { +             bits<3> op2, bit needsreg = 1, bit optionalreg = 0> { +  def : TLBIEntry<name, op1, crn, crm, op2, needsreg, optionalreg>; +  def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> {      let Encoding{7} = 1;      let ExtraRequires = ["AArch64::FeatureXS"];    }    if !eq(hasTLBIP, true) then { -    def : TLBIPEntry<name, op1, crn, crm, op2, needsreg>; -    def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> { +    def : TLBIPEntry<name, op1, crn, crm, op2, needsreg, optionalreg>; +    def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg, optionalreg> {        let Encoding{7} = 1;        let ExtraRequires = ["AArch64::FeatureXS"];      }    }  } -//                   hasTLBIP  op1    CRn     CRm     op2    needsreg +//                   hasTLBIP  op1    CRn     CRm     op2    needsreg, optreg  defm : TLBI<"IPAS2E1IS",    1, 0b100, 0b1000, 0b0000, 0b001>;  defm : TLBI<"IPAS2LE1IS",   1, 0b100, 0b1000, 0b0000, 0b101>; -defm : TLBI<"VMALLE1IS",    0, 0b000, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE2IS",      0, 0b100, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE3IS",      0, 0b110, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"VMALLE1IS",    0, 0b000, 0b1000, 0b0011, 0b000, 0, 1>; +defm : TLBI<"ALLE2IS",      0, 0b100, 0b1000, 0b0011, 0b000, 0, 1>; +defm : TLBI<"ALLE3IS",      0, 0b110, 0b1000, 0b0011, 0b000, 0, 1>;  defm : TLBI<"VAE1IS",       1, 0b000, 0b1000, 0b0011, 0b001>;  defm : TLBI<"VAE2IS",       1, 0b100, 0b1000, 0b0011, 0b001>;  defm : TLBI<"VAE3IS",       1, 0b110, 0b1000, 0b0011, 0b001>;  defm : TLBI<"ASIDE1IS",     0, 0b000, 0b1000, 0b0011, 0b010>;  defm : TLBI<"VAAE1IS",      1, 0b000, 0b1000, 0b0011, 0b011>; -defm : TLBI<"ALLE1IS",      0, 0b100, 0b1000, 0b0011, 0b100, 0>; +defm : TLBI<"ALLE1IS",      0, 0b100, 0b1000, 0b0011, 0b100, 0, 1>;  defm : TLBI<"VALE1IS",      1, 0b000, 0b1000, 0b0011, 0b101>;  defm : TLBI<"VALE2IS",      1, 0b100, 0b1000, 
0b0011, 0b101>;  defm : TLBI<"VALE3IS",      1, 0b110, 0b1000, 0b0011, 0b101>; -defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0>; +defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0, 1>;  defm : TLBI<"VAALE1IS",     1, 0b000, 0b1000, 0b0011, 0b111>;  defm : TLBI<"IPAS2E1",      1, 0b100, 0b1000, 0b0100, 0b001>;  defm : TLBI<"IPAS2LE1",     1, 0b100, 0b1000, 0b0100, 0b101>; -defm : TLBI<"VMALLE1",      0, 0b000, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE2",        0, 0b100, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE3",        0, 0b110, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"VMALLE1",      0, 0b000, 0b1000, 0b0111, 0b000, 0, 0>; +defm : TLBI<"ALLE2",        0, 0b100, 0b1000, 0b0111, 0b000, 0, 0>; +defm : TLBI<"ALLE3",        0, 0b110, 0b1000, 0b0111, 0b000, 0, 0>;  defm : TLBI<"VAE1",         1, 0b000, 0b1000, 0b0111, 0b001>;  defm : TLBI<"VAE2",         1, 0b100, 0b1000, 0b0111, 0b001>;  defm : TLBI<"VAE3",         1, 0b110, 0b1000, 0b0111, 0b001>;  defm : TLBI<"ASIDE1",       0, 0b000, 0b1000, 0b0111, 0b010>;  defm : TLBI<"VAAE1",        1, 0b000, 0b1000, 0b0111, 0b011>; -defm : TLBI<"ALLE1",        0, 0b100, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"ALLE1",        0, 0b100, 0b1000, 0b0111, 0b100, 0, 0>;  defm : TLBI<"VALE1",        1, 0b000, 0b1000, 0b0111, 0b101>;  defm : TLBI<"VALE2",        1, 0b100, 0b1000, 0b0111, 0b101>;  defm : TLBI<"VALE3",        1, 0b110, 0b1000, 0b0111, 0b101>; -defm : TLBI<"VMALLS12E1",   0, 0b100, 0b1000, 0b0111, 0b110, 0>; +defm : TLBI<"VMALLS12E1",   0, 0b100, 0b1000, 0b0111, 0b110, 0, 0>;  defm : TLBI<"VAALE1",       1, 0b000, 0b1000, 0b0111, 0b111>;  // Armv8.4-A Translation Lookaside Buffer Instructions (TLBI)  let Requires = ["AArch64::FeatureTLB_RMI"] in {  // Armv8.4-A Outer Sharable TLB Maintenance instructions: -//                   hasTLBIP  op1    CRn     CRm     op2    needsreg -defm : TLBI<"VMALLE1OS",    0, 0b000, 0b1000, 0b0001, 0b000, 0>; +//                   hasTLBIP  op1    CRn     
CRm     op2    needsreg, optreg +defm : TLBI<"VMALLE1OS",    0, 0b000, 0b1000, 0b0001, 0b000, 0, 1>;  defm : TLBI<"VAE1OS",       1, 0b000, 0b1000, 0b0001, 0b001>;  defm : TLBI<"ASIDE1OS",     0, 0b000, 0b1000, 0b0001, 0b010>;  defm : TLBI<"VAAE1OS",      1, 0b000, 0b1000, 0b0001, 0b011>; @@ -919,15 +941,15 @@ defm : TLBI<"IPAS2E1OS",    1, 0b100, 0b1000, 0b0100, 0b000>;  defm : TLBI<"IPAS2LE1OS",   1, 0b100, 0b1000, 0b0100, 0b100>;  defm : TLBI<"VAE2OS",       1, 0b100, 0b1000, 0b0001, 0b001>;  defm : TLBI<"VALE2OS",      1, 0b100, 0b1000, 0b0001, 0b101>; -defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0>; +defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0, 1>;  defm : TLBI<"VAE3OS",       1, 0b110, 0b1000, 0b0001, 0b001>;  defm : TLBI<"VALE3OS",      1, 0b110, 0b1000, 0b0001, 0b101>; -defm : TLBI<"ALLE2OS",      0, 0b100, 0b1000, 0b0001, 0b000, 0>; -defm : TLBI<"ALLE1OS",      0, 0b100, 0b1000, 0b0001, 0b100, 0>; -defm : TLBI<"ALLE3OS",      0, 0b110, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"ALLE2OS",      0, 0b100, 0b1000, 0b0001, 0b000, 0, 1>; +defm : TLBI<"ALLE1OS",      0, 0b100, 0b1000, 0b0001, 0b100, 0, 1>; +defm : TLBI<"ALLE3OS",      0, 0b110, 0b1000, 0b0001, 0b000, 0, 1>;  // Armv8.4-A TLB Range Maintenance instructions: -//                   hasTLBIP  op1    CRn     CRm     op2    needsreg +//                   hasTLBIP  op1    CRn     CRm     op2  defm : TLBI<"RVAE1",        1, 0b000, 0b1000, 0b0110, 0b001>;  defm : TLBI<"RVAAE1",       1, 0b000, 0b1000, 0b0110, 0b011>;  defm : TLBI<"RVALE1",       1, 0b000, 0b1000, 0b0110, 0b101>; @@ -962,18 +984,19 @@ defm : TLBI<"RVALE3OS",     1, 0b110, 0b1000, 0b0101, 0b101>;  // Armv9-A Realm Management Extension TLBI Instructions  let Requires = ["AArch64::FeatureRME"] in { +//                   hasTLBIP  op1    CRn     CRm     op2    needsreg  defm : TLBI<"RPAOS",        0, 0b110, 0b1000, 0b0100, 0b011>;  defm : TLBI<"RPALOS",       0, 0b110, 0b1000, 0b0100, 0b111>; -defm : 
TLBI<"PAALLOS",      0, 0b110, 0b1000, 0b0001, 0b100, 0>; -defm : TLBI<"PAALL",        0, 0b110, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"PAALLOS",      0, 0b110, 0b1000, 0b0001, 0b100, 0, 0>; +defm : TLBI<"PAALL",        0, 0b110, 0b1000, 0b0111, 0b100, 0, 0>;  }  // Armv9.5-A TLBI VMALL for Dirty State  let Requires = ["AArch64::FeatureTLBIW"] in { -//                                           op1,   CRn,    CRm,    op2,   needsreg -defm : TLBI<"VMALLWS2E1",    0, 0b100, 0b1000, 0b0110, 0b010, 0>; -defm : TLBI<"VMALLWS2E1IS",  0, 0b100, 0b1000, 0b0010, 0b010, 0>; -defm : TLBI<"VMALLWS2E1OS",  0, 0b100, 0b1000, 0b0101, 0b010, 0>; +//                   hasTLBIP  op1    CRn     CRm     op2    needsreg, optreg +defm : TLBI<"VMALLWS2E1",   0, 0b100, 0b1000, 0b0110, 0b010, 0, 0>; +defm : TLBI<"VMALLWS2E1IS", 0, 0b100, 0b1000, 0b0010, 0b010, 0, 1>; +defm : TLBI<"VMALLWS2E1OS", 0, 0b100, 0b1000, 0b0101, 0b010, 0, 1>;  }  //===----------------------------------------------------------------------===// @@ -1862,13 +1885,6 @@ def : ROSysReg<"ERXPFGF_EL1",   0b11, 0b000, 0b0101, 0b0100, 0b100>;  // v8.4a MPAM registers  //                             Op0   Op1    CRn     CRm     Op2 -let Requires = [{ {AArch64::FeatureMPAM} }] in { -def : RWSysReg<"MPAM0_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b001>; -def : RWSysReg<"MPAM1_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM2_EL2",    0b11, 0b100, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM3_EL3",    0b11, 0b110, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAM1_EL12",   0b11, 0b101, 0b1010, 0b0101, 0b000>; -def : RWSysReg<"MPAMHCR_EL2",  0b11, 0b100, 0b1010, 0b0100, 0b000>;  def : RWSysReg<"MPAMVPMV_EL2", 0b11, 0b100, 0b1010, 0b0100, 0b001>;  def : RWSysReg<"MPAMVPM0_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b000>;  def : RWSysReg<"MPAMVPM1_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b001>; @@ -1878,8 +1894,6 @@ def : RWSysReg<"MPAMVPM4_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b100>;  def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 
0b1010, 0b0110, 0b101>;  def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>;  def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>; -def : ROSysReg<"MPAMIDR_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b100>; -} //FeatureMPAM  // v8.4a Activity Monitor registers  //                                 Op0   Op1    CRn     CRm     Op2 @@ -2319,6 +2333,26 @@ def : RWSysReg<"MPAMBW0_EL1",             0b11, 0b000, 0b1010, 0b0101, 0b101>;  def : RWSysReg<"MPAMBWCAP_EL2",           0b11, 0b100, 0b1010, 0b0101, 0b110>;  def : RWSysReg<"MPAMBWSM_EL1",            0b11, 0b000, 0b1010, 0b0101, 0b111>; +// v9.7a Memory partitioning and monitoring version 2 +// (FEAT_MPAMv2) registers +//                               Op0   Op1    CRn     CRm     Op2 +// MPAM system registers that are also available for MPAMv2 +def : RWSysReg<"MPAM0_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b001>; +def : RWSysReg<"MPAM1_EL1",    0b11, 0b000, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM1_EL12",   0b11, 0b101, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM2_EL2",    0b11, 0b100, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAM3_EL3",    0b11, 0b110, 0b1010, 0b0101, 0b000>; +def : RWSysReg<"MPAMHCR_EL2",  0b11, 0b100, 0b1010, 0b0100, 0b000>; +def : ROSysReg<"MPAMIDR_EL1",  0b11, 0b000, 0b1010, 0b0100, 0b100>; +// Only MPAMv2 registers +def : RWSysReg<"MPAMCTL_EL1",   0b11, 0b000, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL12",  0b11, 0b101, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL2",   0b11, 0b100, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMCTL_EL3",   0b11, 0b110, 0b1010, 0b0101, 0b010>; +def : RWSysReg<"MPAMVIDCR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b000>; +def : RWSysReg<"MPAMVIDSR_EL2", 0b11, 0b100, 0b1010, 0b0111, 0b001>; +def : RWSysReg<"MPAMVIDSR_EL3", 0b11, 0b110, 0b1010, 0b0111, 0b001>; +  //===----------------------------------------------------------------------===//  // FEAT_SRMASK v9.6a registers  
//===----------------------------------------------------------------------===// @@ -2412,3 +2446,251 @@ def : DC<"CIVAPS",    0b000, 0b0111, 0b1111, 0b001>;  let Requires = [{ {AArch64::FeaturePoPS, AArch64::FeatureMTE} }] in {  def : DC<"CIGDVAPS",  0b000, 0b0111, 0b1111, 0b101>;  } + +// v9.7a TLBI domains system registers (MemSys) +foreach n = 0-3 in { +  defvar nb = !cast<bits<3>>(n); +  def : RWSysReg<"VTLBID"#n#"_EL2", 0b11,  0b100, 0b0010, 0b1000, nb>; +} + +foreach n = 0-3 in { +  defvar nb = !cast<bits<3>>(n); +  def : RWSysReg<"VTLBIDOS"#n#"_EL2", 0b11,  0b100, 0b0010, 0b1001, nb>; +} + +def : ROSysReg<"TLBIDIDR_EL1",      0b11,  0b000, 0b1010, 0b0100, 0b110>; + +// MPAM Lookaside Buffer Invalidate (MLBI) instructions +class MLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg> { +  string Name = name; +  bits<14> Encoding; +  let Encoding{13-11} = op1; +  let Encoding{10-7} = crn; +  let Encoding{6-3} = crm; +  let Encoding{2-0} = op2; +  bit NeedsReg = needsreg; +  string RequiresStr = [{ {AArch64::FeatureMPAMv2} }]; +} + +def MLBITable : GenericTable { +  let FilterClass = "MLBI"; +  let CppTypeName = "MLBI"; +  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + +  let PrimaryKey = ["Encoding"]; +  let PrimaryKeyName = "lookupMLBIByEncoding"; +} + +def lookupMLBIByName : SearchIndex { +  let Table = MLBITable; +  let Key = ["Name"]; +} + +//                     Op1    CRn     CRm     Op2    needsReg +def : MLBI<"ALLE1",    0b100, 0b0111, 0b0000, 0b100, 0>; +def : MLBI<"VMALLE1",  0b100, 0b0111, 0b0000, 0b101, 0>; +def : MLBI<"VPIDE1",   0b100, 0b0111, 0b0000, 0b110, 1>; +def : MLBI<"VPMGE1",   0b100, 0b0111, 0b0000, 0b111, 1>; + + +// v9.7-A GICv5 (FEAT_GCIE) +// CPU Interface Registers +//                                        Op0   Op1    CRn     CRm     Op2 +def : RWSysReg<"ICC_APR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"ICC_APR_EL3",             0b11, 0b110, 0b1100, 
0b1000, 0b000>; +def : RWSysReg<"ICC_CR0_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b001>; +def : RWSysReg<"ICC_CR0_EL3",             0b11, 0b110, 0b1100, 0b1001, 0b000>; +def : ROSysReg<"ICC_DOMHPPIR_EL3",        0b11, 0b110, 0b1100, 0b1000, 0b010>; +def : ROSysReg<"ICC_HAPR_EL1",            0b11, 0b001, 0b1100, 0b0000, 0b011>; +def : ROSysReg<"ICC_HPPIR_EL1",           0b11, 0b000, 0b1100, 0b1010, 0b011>; +def : ROSysReg<"ICC_HPPIR_EL3",           0b11, 0b110, 0b1100, 0b1001, 0b001>; +def : ROSysReg<"ICC_IAFFIDR_EL1",         0b11, 0b000, 0b1100, 0b1010, 0b101>; +def : RWSysReg<"ICC_ICSR_EL1",            0b11, 0b000, 0b1100, 0b1010, 0b100>; +def : ROSysReg<"ICC_IDR0_EL1",            0b11, 0b000, 0b1100, 0b1010, 0b010>; +def : RWSysReg<"ICC_PCR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"ICC_PCR_EL3",             0b11, 0b110, 0b1100, 0b1000, 0b001>; + +// Virtual CPU Interface Registers +//                                        Op0   Op1    CRn     CRm     Op2 +def : RWSysReg<"ICV_APR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"ICV_CR0_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b001>; +def : RWSysReg<"ICV_HAPR_EL1",            0b11, 0b001, 0b1100, 0b0000, 0b011>; +def : RWSysReg<"ICV_HPPIR_EL1",           0b11, 0b000, 0b1100, 0b1010, 0b011>; +def : RWSysReg<"ICV_PCR_EL1",             0b11, 0b001, 0b1100, 0b0000, 0b010>; + +foreach n=0-3 in { +  defvar nb = !cast<bits<2>>(n); +//                                             Op0   Op1    CRn     CRm     Op2 +  def : RWSysReg<"ICC_PPI_DOMAINR"#n#"_EL3",   0b11, 0b110, 0b1100, 0b1000, {0b1,nb{1-0}}>; + +} + +foreach n=0-15 in{ +  defvar nb = !cast<bits<4>>(n); +//                                               Op0   Op1    CRn     CRm            Op2 +  def : RWSysReg<"ICC_PPI_PRIORITYR"#n#"_EL1",   0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +// PPI and Virtual PPI Registers +multiclass PPIRegisters<string prefix> { +  foreach n=0-1 in { +    
defvar nb = !cast<bit>(n); +//                                                  Op0   Op1    CRn     CRm     Op2 +    def : RWSysReg<prefix#"_PPI_CACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b00,nb}>; +    def : RWSysReg<prefix#"_PPI_CPENDR"#n#"_EL1",   0b11, 0b000, 0b1100, 0b1101, {0b10,nb}>; +    def : RWSysReg<prefix#"_PPI_ENABLER"#n#"_EL1",  0b11, 0b000, 0b1100, 0b1010, {0b11,nb}>; +    def : RWSysReg<prefix#"_PPI_SACTIVER"#n#"_EL1", 0b11, 0b000, 0b1100, 0b1101, {0b01,nb}>; +    def : RWSysReg<prefix#"_PPI_SPENDR"#n#"_EL1",   0b11, 0b000, 0b1100, 0b1101, {0b11,nb}>; +    def : RWSysReg<prefix#"_PPI_HMR"#n#"_EL1",      0b11, 0b000, 0b1100, 0b1010, {0b00,nb}>; +  } +} + +defm : PPIRegisters<"ICC">;  // PPI Registers +defm : PPIRegisters<"ICV">;  // Virtual PPI Registers + +foreach n=0-15 in { +  defvar nb = !cast<bits<4>>(n); +//                                               Op0   Op1    CRn     CRm            Op2 +  def : RWSysReg<"ICV_PPI_PRIORITYR"#n#"_EL1",   0b11, 0b000, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +// Hypervisor Control Registers +//                                    Op0   Op1    CRn     CRm     Op2 +def : RWSysReg<"ICH_APR_EL2",         0b11, 0b100, 0b1100, 0b1000, 0b100>; +def : RWSysReg<"ICH_CONTEXTR_EL2",    0b11, 0b100, 0b1100, 0b1011, 0b110>; +def : RWSysReg<"ICH_HFGITR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b111>; +def : RWSysReg<"ICH_HFGRTR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b100>; +def : RWSysReg<"ICH_HFGWTR_EL2",      0b11, 0b100, 0b1100, 0b1001, 0b110>; +def : ROSysReg<"ICH_HPPIR_EL2",       0b11, 0b100, 0b1100, 0b1000, 0b101>; +def : RWSysReg<"ICH_VCTLR_EL2",       0b11, 0b100, 0b1100, 0b1011, 0b100>; + +foreach n=0-1 in { +  defvar nb = !cast<bit>(n); +//                                           Op0   Op1    CRn     CRm     Op2 +def : RWSysReg<"ICH_PPI_ACTIVER"#n#"_EL2",   0b11, 0b100, 0b1100, 0b1010, {0b11,nb}>; +def : RWSysReg<"ICH_PPI_DVIR"#n#"_EL2",      0b11, 0b100, 0b1100, 0b1010, {0b00,nb}>; +def : 
RWSysReg<"ICH_PPI_ENABLER"#n#"_EL2",   0b11, 0b100, 0b1100, 0b1010, {0b01,nb}>; +def : RWSysReg<"ICH_PPI_PENDR"#n#"_EL2",     0b11, 0b100, 0b1100, 0b1010, {0b10,nb}>; +} + +foreach n=0-15 in { +  defvar nb = !cast<bits<4>>(n); +//                                               Op0   Op1    CRn     CRm            Op2 +  def : RWSysReg<"ICH_PPI_PRIORITYR"#n#"_EL2",   0b11, 0b100, 0b1100, {0b111,nb{3}}, nb{2-0}>; +} + +//===----------------------------------------------------------------------===// +// GICv5 instruction options. +//===----------------------------------------------------------------------===// + +// GIC +class GIC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { +  string Name = name; +  bits<14> Encoding; +  let Encoding{13-11} = op1; +  let Encoding{10-7} = crn; +  let Encoding{6-3} = crm; +  let Encoding{2-0} = op2; +  bit NeedsReg = 1; +  string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +// GSB +class GSB<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { +  string Name = name; +  bits<14> Encoding; +  let Encoding{13-11} = op1; +  let Encoding{10-7} = crn; +  let Encoding{6-3} = crm; +  let Encoding{2-0} = op2; +  string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +// GICR +class GICR<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> { +  string Name = name; +  bits<14> Encoding; +  let Encoding{13-11} = op1; +  let Encoding{10-7} = crn; +  let Encoding{6-3} = crm; +  let Encoding{2-0} = op2; +  bit NeedsReg = 1; +  string RequiresStr = [{ {AArch64::FeatureGCIE} }]; +} + +def GICTable : GenericTable { +  let FilterClass = "GIC"; +  let CppTypeName = "GIC"; +  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + +  let PrimaryKey = ["Encoding"]; +  let PrimaryKeyName = "lookupGICByEncoding"; +} + +def GSBTable : GenericTable { +  let FilterClass = "GSB"; +  let CppTypeName = "GSB"; +  let Fields = ["Name", "Encoding", "RequiresStr"]; + +  let PrimaryKey = ["Encoding"]; +  let 
PrimaryKeyName = "lookupGSBByEncoding"; +} + +def GICRTable : GenericTable { +  let FilterClass = "GICR"; +  let CppTypeName = "GICR"; +  let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; + +  let PrimaryKey = ["Encoding"]; +  let PrimaryKeyName = "lookupGICRByEncoding"; +} + +def lookupGICByName : SearchIndex { +  let Table = GICTable; +  let Key = ["Name"]; +} + +def lookupGSBByName : SearchIndex { +  let Table = GSBTable; +  let Key = ["Name"]; +} + +def lookupGICRByName : SearchIndex { +  let Table = GICRTable; +  let Key = ["Name"]; +} + +//                    Op1    CRn     CRm     Op2 +def : GSB<"sys",      0b000, 0b1100, 0b0000, 0b000>; +def : GSB<"ack",      0b000, 0b1100, 0b0000, 0b001>; + +//                    Op1    CRn     CRm     Op2 +def : GICR<"cdia",    0b000, 0b1100, 0b0011, 0b000>; +def : GICR<"cdnmia",  0b000, 0b1100, 0b0011, 0b001>; + +//                    Op1    CRn     CRm     Op2 +def : GIC<"cdaff",    0b000, 0b1100, 0b0001, 0b011>; +def : GIC<"cddi",     0b000, 0b1100, 0b0010, 0b000>; +def : GIC<"cddis",    0b000, 0b1100, 0b0001, 0b000>; +def : GIC<"cden",     0b000, 0b1100, 0b0001, 0b001>; +def : GIC<"cdeoi",    0b000, 0b1100, 0b0001, 0b111>; +def : GIC<"cdhm",     0b000, 0b1100, 0b0010, 0b001>; +def : GIC<"cdpend",   0b000, 0b1100, 0b0001, 0b100>; +def : GIC<"cdpri",    0b000, 0b1100, 0b0001, 0b010>; +def : GIC<"cdrcfg",   0b000, 0b1100, 0b0001, 0b101>; +def : GIC<"vdaff",    0b100, 0b1100, 0b0001, 0b011>; +def : GIC<"vddi",     0b100, 0b1100, 0b0010, 0b000>; +def : GIC<"vddis",    0b100, 0b1100, 0b0001, 0b000>; +def : GIC<"vden",     0b100, 0b1100, 0b0001, 0b001>; +def : GIC<"vdhm",     0b100, 0b1100, 0b0010, 0b001>; +def : GIC<"vdpend",   0b100, 0b1100, 0b0001, 0b100>; +def : GIC<"vdpri",    0b100, 0b1100, 0b0001, 0b010>; +def : GIC<"vdrcfg",   0b100, 0b1100, 0b0001, 0b101>; +def : GIC<"ldaff",    0b110, 0b1100, 0b0001, 0b011>; +def : GIC<"lddi",     0b110, 0b1100, 0b0010, 0b000>; +def : GIC<"lddis",    0b110, 0b1100, 
0b0001, 0b000>; +def : GIC<"lden",     0b110, 0b1100, 0b0001, 0b001>; +def : GIC<"ldhm",     0b110, 0b1100, 0b0010, 0b001>; +def : GIC<"ldpend",   0b110, 0b1100, 0b0001, 0b100>; +def : GIC<"ldpri",    0b110, 0b1100, 0b0001, 0b010>; +def : GIC<"ldrcfg",   0b110, 0b1100, 0b0001, 0b101>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2053fc4..fede586 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode(  static bool isSMEABIRoutineCall(const CallInst &CI,                                  const AArch64TargetLowering &TLI) {    const auto *F = CI.getCalledFunction(); -  return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); +  return F && +         SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();  }  /// Returns true if the function has explicit operations that can only be @@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,    // change only once and avoid inlining of G into F.    
SMEAttrs FAttrs(*F); -  SMECallAttrs CallAttrs(Call, getTLI()); +  SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());    if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {      if (F == Call.getCaller()) // (1) @@ -957,23 +958,50 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,      return TyL.first + ExtraCost;    }    case Intrinsic::get_active_lane_mask: { -    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); -    if (RetTy) { -      EVT RetVT = getTLI()->getValueType(DL, RetTy); -      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); -      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && -          !getTLI()->isTypeLegal(RetVT)) { -        // We don't have enough context at this point to determine if the mask -        // is going to be kept live after the block, which will force the vXi1 -        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. -        // For now, we just assume the vectorizer created this intrinsic and -        // the result will be the input for a PHI. In this case the cost will -        // be extremely high for fixed-width vectors. -        // NOTE: getScalarizationOverhead returns a cost that's far too -        // pessimistic for the actual generated codegen. In reality there are -        // two instructions generated per lane. 
-        return RetTy->getNumElements() * 2; +    auto RetTy = cast<VectorType>(ICA.getReturnType()); +    EVT RetVT = getTLI()->getValueType(DL, RetTy); +    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); +    if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT)) +      break; + +    if (RetTy->isScalableTy()) { +      if (TLI->getTypeAction(RetTy->getContext(), RetVT) != +          TargetLowering::TypeSplitVector) +        break; + +      auto LT = getTypeLegalizationCost(RetTy); +      InstructionCost Cost = LT.first; +      // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost +      // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g. +      //   nxv32i1 = get_active_lane_mask(base, idx) -> +      //    {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) +      if (ST->hasSVE2p1() || ST->hasSME2()) { +        Cost /= 2; +        if (Cost == 1) +          return Cost;        } + +      // If more than one whilelo intrinsic is required, include the extra cost +      // required by the saturating add & select required to increment the +      // start value after the first intrinsic call. +      Type *OpTy = ICA.getArgTypes()[0]; +      IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy}); +      InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind); +      Type *CondTy = OpTy->getWithNewBitWidth(1); +      SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy, +                                      CmpInst::ICMP_UGT, CostKind); +      return Cost + (SplitCost * (Cost - 1)); +    } else if (!getTLI()->isTypeLegal(RetVT)) { +      // We don't have enough context at this point to determine if the mask +      // is going to be kept live after the block, which will force the vXi1 +      // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. 
+      // For now, we just assume the vectorizer created this intrinsic and +      // the result will be the input for a PHI. In this case the cost will +      // be extremely high for fixed-width vectors. +      // NOTE: getScalarizationOverhead returns a cost that's far too +      // pessimistic for the actual generated codegen. In reality there are +      // two instructions generated per lane. +      return cast<FixedVectorType>(RetTy)->getNumElements() * 2;      }      break;    } diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 636d4f8a..6273cfc 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -159,6 +159,7 @@ private:    SMLoc getLoc() const { return getParser().getTok().getLoc(); }    bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); +  bool parseSyslAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);    bool parseSyspAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);    void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);    AArch64CC::CondCode parseCondCodeString(StringRef Cond, @@ -266,6 +267,7 @@ private:    ParseStatus tryParseRPRFMOperand(OperandVector &Operands);    ParseStatus tryParsePSBHint(OperandVector &Operands);    ParseStatus tryParseBTIHint(OperandVector &Operands); +  ParseStatus tryParseCMHPriorityHint(OperandVector &Operands);    ParseStatus tryParseAdrpLabel(OperandVector &Operands);    ParseStatus tryParseAdrLabel(OperandVector &Operands);    template <bool AddFPZeroAsLiteral> @@ -370,6 +372,7 @@ private:      k_PSBHint,      k_PHint,      k_BTIHint, +    k_CMHPriorityHint,    } Kind;    SMLoc StartLoc, EndLoc; @@ -499,6 +502,11 @@ private:      unsigned Length;      unsigned Val;    }; +  struct CMHPriorityHintOp { +    const char *Data; +    unsigned Length; +    unsigned Val; +  };    struct SVCROp {    
  const char *Data; @@ -525,6 +533,7 @@ private:      struct PSBHintOp PSBHint;      struct PHintOp PHint;      struct BTIHintOp BTIHint; +    struct CMHPriorityHintOp CMHPriorityHint;      struct ShiftExtendOp ShiftExtend;      struct SVCROp SVCR;    }; @@ -595,6 +604,9 @@ public:      case k_BTIHint:        BTIHint = o.BTIHint;        break; +    case k_CMHPriorityHint: +      CMHPriorityHint = o.CMHPriorityHint; +      break;      case k_ShiftExtend:        ShiftExtend = o.ShiftExtend;        break; @@ -769,6 +781,16 @@ public:      return StringRef(BTIHint.Data, BTIHint.Length);    } +  unsigned getCMHPriorityHint() const { +    assert(Kind == k_CMHPriorityHint && "Invalid access!"); +    return CMHPriorityHint.Val; +  } + +  StringRef getCMHPriorityHintName() const { +    assert(Kind == k_CMHPriorityHint && "Invalid access!"); +    return StringRef(CMHPriorityHint.Data, CMHPriorityHint.Length); +  } +    StringRef getSVCR() const {      assert(Kind == k_SVCR && "Invalid access!");      return StringRef(SVCR.Data, SVCR.Length); @@ -1511,6 +1533,7 @@ public:    bool isPSBHint() const { return Kind == k_PSBHint; }    bool isPHint() const { return Kind == k_PHint; }    bool isBTIHint() const { return Kind == k_BTIHint; } +  bool isCMHPriorityHint() const { return Kind == k_CMHPriorityHint; }    bool isShiftExtend() const { return Kind == k_ShiftExtend; }    bool isShifter() const {      if (!isShiftExtend()) @@ -2196,6 +2219,11 @@ public:      Inst.addOperand(MCOperand::createImm(getBTIHint()));    } +  void addCMHPriorityHintOperands(MCInst &Inst, unsigned N) const { +    assert(N == 1 && "Invalid number of operands!"); +    Inst.addOperand(MCOperand::createImm(getCMHPriorityHint())); +  } +    void addShifterOperands(MCInst &Inst, unsigned N) const {      assert(N == 1 && "Invalid number of operands!");      unsigned Imm = @@ -2547,6 +2575,17 @@ public:    }    static std::unique_ptr<AArch64Operand> +  CreateCMHPriorityHint(unsigned Val, StringRef Str, SMLoc S, 
MCContext &Ctx) { +    auto Op = std::make_unique<AArch64Operand>(k_CMHPriorityHint, Ctx); +    Op->CMHPriorityHint.Val = Val; +    Op->CMHPriorityHint.Data = Str.data(); +    Op->CMHPriorityHint.Length = Str.size(); +    Op->StartLoc = S; +    Op->EndLoc = S; +    return Op; +  } + +  static std::unique_ptr<AArch64Operand>    CreateMatrixRegister(unsigned RegNum, unsigned ElementWidth, MatrixKind Kind,                         SMLoc S, SMLoc E, MCContext &Ctx) {      auto Op = std::make_unique<AArch64Operand>(k_MatrixRegister, Ctx); @@ -2656,6 +2695,9 @@ void AArch64Operand::print(raw_ostream &OS, const MCAsmInfo &MAI) const {    case k_BTIHint:      OS << getBTIHintName();      break; +  case k_CMHPriorityHint: +    OS << getCMHPriorityHintName(); +    break;    case k_MatrixRegister:      OS << "<matrix " << getMatrixReg() << ">";      break; @@ -3279,6 +3321,24 @@ ParseStatus AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {    return ParseStatus::Success;  } +/// tryParseCMHPriorityHint - Try to parse a CMHPriority operand +ParseStatus AArch64AsmParser::tryParseCMHPriorityHint(OperandVector &Operands) { +  SMLoc S = getLoc(); +  const AsmToken &Tok = getTok(); +  if (Tok.isNot(AsmToken::Identifier)) +    return TokError("invalid operand for instruction"); + +  auto CMHPriority = +      AArch64CMHPriorityHint::lookupCMHPriorityHintByName(Tok.getString()); +  if (!CMHPriority) +    return TokError("invalid operand for instruction"); + +  Operands.push_back(AArch64Operand::CreateCMHPriorityHint( +      CMHPriority->Encoding, Tok.getString(), S, getContext())); +  Lex(); // Eat identifier token. +  return ParseStatus::Success; +} +  /// tryParseAdrpLabel - Parse and validate a source label for the ADRP  /// instruction.  
ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { @@ -3824,6 +3884,18 @@ static const struct Extension {      {"ssve-bitperm", {AArch64::FeatureSSVE_BitPerm}},      {"sme-mop4", {AArch64::FeatureSME_MOP4}},      {"sme-tmop", {AArch64::FeatureSME_TMOP}}, +    {"cmh", {AArch64::FeatureCMH}}, +    {"lscp", {AArch64::FeatureLSCP}}, +    {"tlbid", {AArch64::FeatureTLBID}}, +    {"mpamv2", {AArch64::FeatureMPAMv2}}, +    {"mtetc", {AArch64::FeatureMTETC}}, +    {"gcie", {AArch64::FeatureGCIE}}, +    {"sme2p3", {AArch64::FeatureSME2p3}}, +    {"sve2p3", {AArch64::FeatureSVE2p3}}, +    {"sve-b16mm", {AArch64::FeatureSVE_B16MM}}, +    {"f16mm", {AArch64::FeatureF16MM}}, +    {"f16f32dot", {AArch64::FeatureF16F32DOT}}, +    {"f16f32mm", {AArch64::FeatureF16F32MM}},  };  static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { @@ -3861,6 +3933,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {      Str += "ARMv9.5a";    else if (FBS[AArch64::HasV9_6aOps])      Str += "ARMv9.6a"; +  else if (FBS[AArch64::HasV9_7aOps]) +    Str += "ARMv9.7a";    else if (FBS[AArch64::HasV8_0rOps])      Str += "ARMv8r";    else { @@ -3894,8 +3968,9 @@ void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));  } -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. +/// parseSysAlias - The IC, DC, AT, TLBI, MLBI and GIC{R} and GSB instructions +/// are simple aliases for the SYS instruction. Parse them specially so that +/// we create a SYS MCInst.  
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,                                     OperandVector &Operands) {    if (Name.contains('.')) @@ -3908,6 +3983,8 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,    StringRef Op = Tok.getString();    SMLoc S = Tok.getLoc();    bool ExpectRegister = true; +  bool OptionalRegister = false; +  bool hasAll = getSTI().hasFeature(AArch64::FeatureAll);    if (Mnemonic == "ic") {      const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op); @@ -3950,13 +4027,50 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,        return TokError(Str);      }      ExpectRegister = TLBI->NeedsReg; +    bool hasTLBID = getSTI().hasFeature(AArch64::FeatureTLBID); +    if (hasAll || hasTLBID) { +      OptionalRegister = TLBI->OptionalReg; +    }      createSysAlias(TLBI->Encoding, Operands, S); -  } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" || Mnemonic == "cosp") { +  } else if (Mnemonic == "mlbi") { +    const AArch64MLBI::MLBI *MLBI = AArch64MLBI::lookupMLBIByName(Op); +    if (!MLBI) +      return TokError("invalid operand for MLBI instruction"); +    else if (!MLBI->haveFeatures(getSTI().getFeatureBits())) { +      std::string Str("MLBI " + std::string(MLBI->Name) + " requires: "); +      setRequiredFeatureString(MLBI->getRequiredFeatures(), Str); +      return TokError(Str); +    } +    ExpectRegister = MLBI->NeedsReg; +    createSysAlias(MLBI->Encoding, Operands, S); +  } else if (Mnemonic == "gic") { +    const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByName(Op); +    if (!GIC) +      return TokError("invalid operand for GIC instruction"); +    else if (!GIC->haveFeatures(getSTI().getFeatureBits())) { +      std::string Str("GIC " + std::string(GIC->Name) + " requires: "); +      setRequiredFeatureString(GIC->getRequiredFeatures(), Str); +      return TokError(Str); +    } +    ExpectRegister = true; +    createSysAlias(GIC->Encoding, Operands, S); 
+  } else if (Mnemonic == "gsb") { +    const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByName(Op); +    if (!GSB) +      return TokError("invalid operand for GSB instruction"); +    else if (!GSB->haveFeatures(getSTI().getFeatureBits())) { +      std::string Str("GSB " + std::string(GSB->Name) + " requires: "); +      setRequiredFeatureString(GSB->getRequiredFeatures(), Str); +      return TokError(Str); +    } +    ExpectRegister = false; +    createSysAlias(GSB->Encoding, Operands, S); +  } else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp" || +             Mnemonic == "cosp") {      if (Op.lower() != "rctx")        return TokError("invalid operand for prediction restriction instruction"); -    bool hasAll = getSTI().hasFeature(AArch64::FeatureAll);      bool hasPredres = hasAll || getSTI().hasFeature(AArch64::FeaturePredRes);      bool hasSpecres2 = hasAll || getSTI().hasFeature(AArch64::FeatureSPECRES2); @@ -3989,10 +4103,61 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,      HasRegister = true;    } -  if (ExpectRegister && !HasRegister) -    return TokError("specified " + Mnemonic + " op requires a register"); -  else if (!ExpectRegister && HasRegister) -    return TokError("specified " + Mnemonic + " op does not use a register"); +  if (!OptionalRegister) { +    if (ExpectRegister && !HasRegister) +      return TokError("specified " + Mnemonic + " op requires a register"); +    else if (!ExpectRegister && HasRegister) +      return TokError("specified " + Mnemonic + " op does not use a register"); +  } + +  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list")) +    return true; + +  return false; +} + +/// parseSyslAlias - The GICR instructions are simple aliases for +/// the SYSL instruction. Parse them specially so that we create a +/// SYS MCInst. 
+bool AArch64AsmParser::parseSyslAlias(StringRef Name, SMLoc NameLoc, +                                      OperandVector &Operands) { + +  Mnemonic = Name; +  Operands.push_back( +      AArch64Operand::CreateToken("sysl", NameLoc, getContext())); + +  // Now expect two operands (identifier + register) +  SMLoc startLoc = getLoc(); +  const AsmToken &regTok = getTok(); +  StringRef reg = regTok.getString(); +  unsigned RegNum = matchRegisterNameAlias(reg.lower(), RegKind::Scalar); +  if (!RegNum) +    return TokError("expected register operand"); + +  Operands.push_back(AArch64Operand::CreateReg( +      RegNum, RegKind::Scalar, startLoc, getLoc(), getContext(), EqualsReg)); + +  Lex(); // Eat token +  if (parseToken(AsmToken::Comma)) +    return true; + +  // Check for identifier +  const AsmToken &operandTok = getTok(); +  StringRef Op = operandTok.getString(); +  SMLoc S2 = operandTok.getLoc(); +  Lex(); // Eat token + +  if (Mnemonic == "gicr") { +    const AArch64GICR::GICR *GICR = AArch64GICR::lookupGICRByName(Op); +    if (!GICR) +      return Error(S2, "invalid operand for GICR instruction"); +    else if (!GICR->haveFeatures(getSTI().getFeatureBits())) { +      std::string Str("GICR " + std::string(GICR->Name) + " requires: "); +      setRequiredFeatureString(GICR->getRequiredFeatures(), Str); +      return Error(S2, Str); +    } +    createSysAlias(GICR->Encoding, Operands, S2); +  }    if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))      return true; @@ -4025,7 +4190,7 @@ bool AArch64AsmParser::parseSyspAlias(StringRef Name, SMLoc NameLoc,        return TokError("invalid operand for TLBIP instruction");      const AArch64TLBIP::TLBIP TLBIP(          TLBIPorig->Name, TLBIPorig->Encoding | (HasnXSQualifier ? (1 << 7) : 0), -        TLBIPorig->NeedsReg, +        TLBIPorig->NeedsReg, TLBIPorig->OptionalReg,          HasnXSQualifier              ? 
TLBIPorig->FeaturesRequired | FeatureBitset({AArch64::FeatureXS})              : TLBIPorig->FeaturesRequired); @@ -4719,6 +4884,13 @@ ParseStatus AArch64AsmParser::tryParseVectorList(OperandVector &Operands,        FirstReg, Count, Stride, NumElements, ElementWidth, VectorKind, S,        getLoc(), getContext())); +  if (getTok().is(AsmToken::LBrac)) { +    ParseStatus Res = tryParseVectorIndex(Operands); +    if (Res.isFailure()) +      return ParseStatus::Failure; +    return ParseStatus::Success; +  } +    return ParseStatus::Success;  } @@ -5267,12 +5439,17 @@ bool AArch64AsmParser::parseInstruction(ParseInstructionInfo &Info,    size_t Start = 0, Next = Name.find('.');    StringRef Head = Name.slice(Start, Next); -  // IC, DC, AT, TLBI and Prediction invalidation instructions are aliases for -  // the SYS instruction. +  // IC, DC, AT, TLBI, MLBI, GIC{R}, GSB and Prediction invalidation +  // instructions are aliases for the SYS instruction.    if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi" || -      Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp") +      Head == "cfp" || Head == "dvp" || Head == "cpp" || Head == "cosp" || +      Head == "mlbi" || Head == "gic" || Head == "gsb")      return parseSysAlias(Head, NameLoc, Operands); +  // GICR instructions are aliases for the SYSL instruction. +  if (Head == "gicr") +    return parseSyslAlias(Head, NameLoc, Operands); +    // TLBIP instructions are aliases for the SYSP instruction.    
if (Head == "tlbip")      return parseSyspAlias(Head, NameLoc, Operands); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3e55b76..14b0f9a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5126,23 +5126,13 @@ bool AArch64InstructionSelector::selectShuffleVector(      MachineInstr &I, MachineRegisterInfo &MRI) {    const LLT DstTy = MRI.getType(I.getOperand(0).getReg());    Register Src1Reg = I.getOperand(1).getReg(); -  const LLT Src1Ty = MRI.getType(Src1Reg);    Register Src2Reg = I.getOperand(2).getReg(); -  const LLT Src2Ty = MRI.getType(Src2Reg);    ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();    MachineBasicBlock &MBB = *I.getParent();    MachineFunction &MF = *MBB.getParent();    LLVMContext &Ctx = MF.getFunction().getContext(); -  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if -  // it's originated from a <1 x T> type. Those should have been lowered into -  // G_BUILD_VECTOR earlier. 
-  if (!Src1Ty.isVector() || !Src2Ty.isVector()) { -    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); -    return false; -  } -    unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;    SmallVector<Constant *, 64> CstIdxs; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 05a4313..5f93847 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1201,25 +1201,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)          return llvm::is_contained(              {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy);        }) -      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar -      // destinations, we just want those lowered into G_BUILD_VECTOR or -      // G_EXTRACT_ELEMENT. -      .lowerIf([=](const LegalityQuery &Query) { -        return !Query.Types[0].isVector() || !Query.Types[1].isVector(); -      })        .moreElementsIf(            [](const LegalityQuery &Query) { -            return Query.Types[0].isVector() && Query.Types[1].isVector() && -                   Query.Types[0].getNumElements() > -                       Query.Types[1].getNumElements(); +            return Query.Types[0].getNumElements() > +                   Query.Types[1].getNumElements();            },            changeTo(1, 0))        .moreElementsToNextPow2(0)        .moreElementsIf(            [](const LegalityQuery &Query) { -            return Query.Types[0].isVector() && Query.Types[1].isVector() && -                   Query.Types[0].getNumElements() < -                       Query.Types[1].getNumElements(); +            return Query.Types[0].getNumElements() < +                   Query.Types[1].getNumElements();            },            changeTo(0, 1))        .widenScalarOrEltToNextPow2OrMinSize(0, 8) diff --git 
a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 830a35bb..6d2d705 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -856,7 +856,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {      break;    }    case TargetOpcode::G_FPTOSI_SAT: -  case TargetOpcode::G_FPTOUI_SAT: { +  case TargetOpcode::G_FPTOUI_SAT: +  case TargetOpcode::G_FPTOSI: +  case TargetOpcode::G_FPTOUI: {      LLT DstType = MRI.getType(MI.getOperand(0).getReg());      if (DstType.isVector())        break; @@ -864,11 +866,19 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {        OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};        break;      } -    OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; +    TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); +    TypeSize SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, TRI); +    if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) && +        all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), +               [&](const MachineInstr &UseMI) { +                 return onlyUsesFP(UseMI, MRI, TRI) || +                        prefersFPUse(UseMI, MRI, TRI); +               })) +      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; +    else +      OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};      break;    } -  case TargetOpcode::G_FPTOSI: -  case TargetOpcode::G_FPTOUI:    case TargetOpcode::G_INTRINSIC_LRINT:    case TargetOpcode::G_INTRINSIC_LLRINT:      if (MRI.getType(MI.getOperand(0).getReg()).isVector()) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 35bd244..5c3e26e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ 
b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -84,6 +84,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,        return;      } +  if (Opcode == AArch64::SYSLxt) +    if (printSyslAlias(MI, STI, O)) { +      printAnnotation(O, Annot); +      return; +    } +    if (Opcode == AArch64::SYSPxt || Opcode == AArch64::SYSPxt_XZR)      if (printSyspAlias(MI, STI, O)) {        printAnnotation(O, Annot); @@ -909,13 +915,25 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,    Encoding |= CnVal << 7;    Encoding |= Op1Val << 11; -  bool NeedsReg; +  bool NeedsReg = false; +  bool OptionalReg = false;    std::string Ins;    std::string Name;    if (CnVal == 7) {      switch (CmVal) {      default: return false; +    // MLBI aliases +    case 0: { +      const AArch64MLBI::MLBI *MLBI = +          AArch64MLBI::lookupMLBIByEncoding(Encoding); +      if (!MLBI || !MLBI->haveFeatures(STI.getFeatureBits())) +        return false; + +      NeedsReg = MLBI->NeedsReg; +      Ins = "mlbi\t"; +      Name = std::string(MLBI->Name); +    } break;      // Maybe IC, maybe Prediction Restriction      case 1:        switch (Op1Val) { @@ -1004,19 +1022,41 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,        return false;      NeedsReg = TLBI->NeedsReg; +    if (STI.hasFeature(AArch64::FeatureAll) || +        STI.hasFeature(AArch64::FeatureTLBID)) +      OptionalReg = TLBI->OptionalReg;      Ins = "tlbi\t";      Name = std::string(TLBI->Name); -  } -  else +  } else if (CnVal == 12) { +    if (CmVal != 0) { +      // GIC aliases +      const AArch64GIC::GIC *GIC = AArch64GIC::lookupGICByEncoding(Encoding); +      if (!GIC || !GIC->haveFeatures(STI.getFeatureBits())) +        return false; + +      NeedsReg = true; +      Ins = "gic\t"; +      Name = std::string(GIC->Name); +    } else { +      // GSB aliases +      const AArch64GSB::GSB *GSB = AArch64GSB::lookupGSBByEncoding(Encoding); +      if (!GSB || 
!GSB->haveFeatures(STI.getFeatureBits())) +        return false; + +      NeedsReg = false; +      Ins = "gsb\t"; +      Name = std::string(GSB->Name); +    } +  } else      return false;    StringRef Reg = getRegisterName(MI->getOperand(4).getReg());    bool NotXZR = Reg != "xzr"; -  // If a mandatory is not specified in the TableGen +  // If a mandatory or optional register is not specified in the TableGen    // (i.e. no register operand should be present), and the register value    // is not xzr/x31, then disassemble to a SYS alias instead. -  if (NotXZR && !NeedsReg) +  if (NotXZR && !NeedsReg && !OptionalReg)      return false;    std::string Str = Ins + Name; @@ -1024,12 +1064,64 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,    O << '\t' << Str; -  if (NeedsReg) +  // For optional registers, don't print the value if it's xzr/x31 +  // since this defaults to xzr/x31 if register is not specified. +  if (NeedsReg || (OptionalReg && NotXZR))      O << ", " << Reg;    return true;  } +bool AArch64InstPrinter::printSyslAlias(const MCInst *MI, +                                        const MCSubtargetInfo &STI, +                                        raw_ostream &O) { +#ifndef NDEBUG +  unsigned Opcode = MI->getOpcode(); +  assert(Opcode == AArch64::SYSLxt && "Invalid opcode for SYSL alias!"); +#endif + +  StringRef Reg = getRegisterName(MI->getOperand(0).getReg()); +  const MCOperand &Op1 = MI->getOperand(1); +  const MCOperand &Cn = MI->getOperand(2); +  const MCOperand &Cm = MI->getOperand(3); +  const MCOperand &Op2 = MI->getOperand(4); + +  unsigned Op1Val = Op1.getImm(); +  unsigned CnVal = Cn.getImm(); +  unsigned CmVal = Cm.getImm(); +  unsigned Op2Val = Op2.getImm(); + +  uint16_t Encoding = Op2Val; +  Encoding |= CmVal << 3; +  Encoding |= CnVal << 7; +  Encoding |= Op1Val << 11; + +  std::string Ins; +  std::string Name; + +  if (CnVal == 12) { +    if (CmVal == 3) { +      // GICR aliases +      const AArch64GICR::GICR *GICR = +          
AArch64GICR::lookupGICRByEncoding(Encoding); +      if (!GICR || !GICR->haveFeatures(STI.getFeatureBits())) +        return false; + +      Ins = "gicr"; +      Name = std::string(GICR->Name); +    } else +      return false; +  } else +    return false; + +  std::string Str; +  llvm::transform(Name, Name.begin(), ::tolower); + +  O << '\t' << Ins << '\t' << Reg.str() << ", " << Name; + +  return true; +} +  bool AArch64InstPrinter::printSyspAlias(const MCInst *MI,                                          const MCSubtargetInfo &STI,                                          raw_ostream &O) { @@ -1508,6 +1600,17 @@ void AArch64InstPrinter::printBTIHintOp(const MCInst *MI, unsigned OpNum,      markup(O, Markup::Immediate) << '#' << formatImm(btihintop);  } +void AArch64InstPrinter::printCMHPriorityHintOp(const MCInst *MI, +                                                unsigned OpNum, +                                                const MCSubtargetInfo &STI, +                                                raw_ostream &O) { +  unsigned priorityhint_op = MI->getOperand(OpNum).getImm(); +  auto PHint = +      AArch64CMHPriorityHint::lookupCMHPriorityHintByEncoding(priorityhint_op); +  if (PHint) +    O << PHint->Name; +} +  void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,                                             const MCSubtargetInfo &STI,                                             raw_ostream &O) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 15ef2dd..307402d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -52,6 +52,8 @@ public:  protected:    bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI,                       raw_ostream &O); +  bool printSyslAlias(const MCInst *MI, const MCSubtargetInfo &STI, +                      raw_ostream &O);    
bool printSyspAlias(const MCInst *MI, const MCSubtargetInfo &STI,                        raw_ostream &O);    bool printRangePrefetchAlias(const MCInst *MI, const MCSubtargetInfo &STI, @@ -151,6 +153,9 @@ protected:    void printBTIHintOp(const MCInst *MI, unsigned OpNum,                        const MCSubtargetInfo &STI, raw_ostream &O); +  void printCMHPriorityHintOp(const MCInst *MI, unsigned OpNum, +                              const MCSubtargetInfo &STI, raw_ostream &O); +    void printFPImmOperand(const MCInst *MI, unsigned OpNum,                           const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 33f35ad..99836ae 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -3920,6 +3920,78 @@ multiclass sme2_luti4_vector_vg4_index<string mnemonic> {    def _S : sme2_luti4_vector_vg4_index<0b10, ZZZZ_s_mul_r, mnemonic>;  } +// 8-bit Look up table +class sme2_lut_single<string asm> +  : I<(outs ZPR8:$Zd), (ins ZTR:$ZTt, ZPRAny:$Zn), +    asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { +  bits<0> ZTt; +  bits<5> Zd; +  bits<5> Zn; +  let Inst{31-10} = 0b1100000011001000010000; +  let Inst{9-5}   = Zn; +  let Inst{4-0}   = Zd; +} + +//===----------------------------------------------------------------------===// +// Lookup table read with 6-bit indices (8-bit) +class sme2_luti6_zt_base<RegisterOperand zd_ty, string asm> +  : I<(outs zd_ty:$Zd), (ins ZTR:$ZTt, ZZZ_Any:$Zn), +    asm, "\t$Zd, $ZTt, $Zn", "", []>, Sched<[]> { +  bits<0> ZTt; +  bits<3> Zd; +  bits<3> Zn; +  let Inst{31-21} = 0b11000000100; +  let Inst{19-10} = 0b1010000000; +  let Inst{9-7}   = Zn; +  let Inst{6-5}   = 0b00; +} + +class sme2_luti6_zt_consecutive<string asm> +  : sme2_luti6_zt_base<ZZZZ_b_mul_r, asm> { +  let Inst{20}    = 0; +  let Inst{4-2}   = Zd; +  let Inst{1-0}   = 0b00; +} + +class sme2_luti6_zt_strided<string asm> +  : 
sme2_luti6_zt_base<ZZZZ_b_strided, asm> { +  let Inst{20}    = 1; +  let Inst{4}     = Zd{2}; +  let Inst{3-2}   = 0b00; +  let Inst{1-0}   = Zd{1-0}; +} + +//===----------------------------------------------------------------------===// +// Lookup table read with 6-bit indices (8-bit) +class sme2_luti6_vector_vg4_base<RegisterOperand zd_ty, string asm> +  : I<(outs zd_ty:$Zd), (ins ZZ_h:$Zn, ZZ_Any:$Zm, VectorIndexD:$i1), +    asm, "\t$Zd, $Zn, $Zm$i1", "", []>, Sched<[]> { +  bits<3> Zd; +  bits<5> Zn; +  bits<5> Zm; +  bits<1> i1; +  let Inst{31-23} = 0b110000010; +  let Inst{22}    = i1; +  let Inst{21}    = 0b1; +  let Inst{20-16} = Zm; +  let Inst{9-5}   = Zn; +} + +class sme2_luti6_vector_vg4_consecutive<string asm> +  : sme2_luti6_vector_vg4_base<ZZZZ_h_mul_r, asm> { +  let Inst{15-10} = 0b111101; +  let Inst{4-2}   = Zd; +  let Inst{1-0}   = 0b00; +} + +class sme2_luti6_vector_vg4_strided<string asm> +  : sme2_luti6_vector_vg4_base<ZZZZ_h_strided, asm> { +  let Inst{15-10} = 0b111111; +  let Inst{4}     = Zd{2}; +  let Inst{3-2}   = 0b00; +  let Inst{1-0}   = Zd{1-0}; +} +  //===----------------------------------------------------------------------===//  // SME2 MOV  class sme2_mova_vec_to_tile_vg2_multi_base<bits<2> sz, bit v, diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3cdd505..1664f4a 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3787,7 +3787,7 @@ multiclass sve2p1_two_way_dot_vv<string mnemonic, bit u, SDPatternOperator intri  // SVE Integer Dot Product Group - Indexed Group  //===----------------------------------------------------------------------===// -class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm, +class sve_intx_dot_by_indexed_elem<bit U, string asm,                                     ZPRRegOp zprty1, ZPRRegOp zprty2,                                     ZPRRegOp zprty3, Operand itype>  : I<(outs 
zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop), @@ -3795,8 +3795,7 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,    "", []>, Sched<[]> {    bits<5> Zda;    bits<5> Zn; -  let Inst{31-23} = 0b010001001; -  let Inst{22}    = sz; +  let Inst{31-24} = 0b01000100;    let Inst{21}    = 0b1;    let Inst{15-11} = 0;    let Inst{10}    = U; @@ -3810,16 +3809,18 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,  multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,                                          SDPatternOperator op> { -  def _BtoS : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { +  def _BtoS : sve_intx_dot_by_indexed_elem<opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {      bits<2> iop;      bits<3> Zm; +    let Inst{23-22} = 0b10;      let Inst{20-19} = iop;      let Inst{18-16} = Zm;    } -  def _HtoD : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { +  def _HtoD : sve_intx_dot_by_indexed_elem<opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {      bits<1> iop;      bits<4> Zm; -    let Inst{20} = iop; +    let Inst{23-22} = 0b11; +    let Inst{20}    = iop;      let Inst{19-16} = Zm;    } @@ -3827,6 +3828,16 @@ multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,    def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv8i16, nxv8i16, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _HtoD)>;  } +class sve_intx_dot_by_indexed_elem_x<bit opc, string asm> +: sve_intx_dot_by_indexed_elem<opc, asm, ZPR16, ZPR8, ZPR3b8, VectorIndexH32b_timm> { + bits<3> iop; + bits<3> Zm; + let Inst{23}    = 0b0; + let Inst{22}    = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; +} +  //===----------------------------------------------------------------------===//  // SVE2 Complex Integer Dot Product Group  //===----------------------------------------------------------------------===// @@ -4085,7 +4096,7 
@@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,    bits<5> Zdn;    let Inst{31-24} = 0b01000100;    let Inst{23-22} = sz; -  let Inst{21-20} = 0b01; +  let Inst{21}    = 0b0;    let Inst{20-16} = opc{5-1};    let Inst{15-14} = 0b10;    let Inst{13}    = opc{0}; @@ -4590,15 +4601,15 @@ multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {    def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;  } -class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm, +class sve2_int_absdiff_accum<bits<3> sz, bits<4> opc, string asm,                               ZPRRegOp zprty1, ZPRRegOp zprty2>  : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),    asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {    bits<5> Zda;    bits<5> Zn;    bits<5> Zm; -  let Inst{31-24} = 0b01000101; -  let Inst{23-22} = sz; +  let Inst{31-25} = 0b0100010; +  let Inst{24-22} = sz;    let Inst{21}    = 0b0;    let Inst{20-16} = Zm;    let Inst{15-14} = 0b11; @@ -4613,10 +4624,10 @@ class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,  }  multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> { -  def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; -  def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; -  def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; -  def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>; +  def _B : sve2_int_absdiff_accum<0b100, { 0b111, opc }, asm, ZPR8, ZPR8>; +  def _H : sve2_int_absdiff_accum<0b101, { 0b111, opc }, asm, ZPR16, ZPR16>; +  def _S : sve2_int_absdiff_accum<0b110, { 0b111, opc }, asm, ZPR32, ZPR32>; +  def _D : sve2_int_absdiff_accum<0b111, { 0b111, opc }, asm, ZPR64, ZPR64>;    def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;    def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, 
!cast<Instruction>(NAME # _H)>; @@ -4626,20 +4637,26 @@ multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {  multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,                                         SDPatternOperator op> { -  def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; -  def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; -  def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; +  def _H : sve2_int_absdiff_accum<0b101, { 0b00, opc }, asm, ZPR16, ZPR8>; +  def _S : sve2_int_absdiff_accum<0b110, { 0b00, opc }, asm, ZPR32, ZPR16>; +  def _D : sve2_int_absdiff_accum<0b111, { 0b00, opc }, asm, ZPR64, ZPR32>;    def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;    def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;    def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;  } +multiclass sve2_int_two_way_absdiff_accum_long<bit U, string asm> { +  def _BtoH : sve2_int_absdiff_accum<0b001, { 0b01, U, 0b1 }, asm, ZPR16, ZPR8>; +  def _HtoS : sve2_int_absdiff_accum<0b010, { 0b01, U, 0b1 }, asm, ZPR32, ZPR16>; +  def _StoD : sve2_int_absdiff_accum<0b011, { 0b01, U, 0b1 }, asm, ZPR64, ZPR32>; +} +  multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm,                                        SDPatternOperator op> { -  def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, +  def _S : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b0 }, { 0b010, opc{0} }, asm,                                    ZPR32, ZPR32>; -  def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, +  def _D : sve2_int_absdiff_accum<{ 0b1, opc{1}, 0b1 }, { 0b010, opc{0} }, asm,                                    ZPR64, ZPR64>;    def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; @@ -9610,17 +9627,18 @@ multiclass 
sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {  // SVE Floating Point Matrix Multiply Accumulate Group  //===----------------------------------------------------------------------===// -class sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty> +class sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty>  : I<(outs zda_ty:$Zda), (ins zda_ty:$_Zda, reg_ty:$Zn, reg_ty:$Zm),      asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {    bits<5> Zda;    bits<5> Zn;    bits<5> Zm;    let Inst{31-24} = 0b01100100; -  let Inst{23-22} = opc; +  let Inst{23-22} = opc{2-1};    let Inst{21}    = 1;    let Inst{20-16} = Zm; -  let Inst{15-10} = 0b111001; +  let Inst{15-11} = 0b11100; +  let Inst{10}    = opc{0};    let Inst{9-5}   = Zn;    let Inst{4-0}   = Zda; @@ -9630,10 +9648,12 @@ class sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_t    let mayRaiseFPException = 1;  } -multiclass sve_fp_matrix_mla<bits<2> opc, string asm, ZPRRegOp zda_ty, ZPRRegOp reg_ty, SDPatternOperator op, ValueType zda_vt, ValueType reg_vt> { +multiclass sve_fp_matrix_mla<bits<3> opc, string asm, ZPRRegOp zda_ty, +                             ZPRRegOp reg_ty, SDPatternOperator op, +                             ValueType zda_vt, ValueType reg_vt> {    def NAME : sve_fp_matrix_mla<opc, asm, zda_ty, reg_ty>; -  def : SVE_3_Op_Pat<zda_vt, op , zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>; +  def : SVE_3_Op_Pat<zda_vt, op, zda_vt, reg_vt, reg_vt, !cast<Instruction>(NAME)>;  }  //===----------------------------------------------------------------------===// @@ -10030,18 +10050,19 @@ multiclass sve2p1_multi_vec_extract_narrow<string mnemonic, bits<2> opc, SDPatte  }  // SVE2 multi-vec shift narrow -class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz> -    : I<(outs ZPR16:$Zd), (ins ZZ_s_mul_r:$Zn, vecshiftR16:$imm4), -        mnemonic, "\t$Zd, $Zn, $imm4", +class 
sve2p1_multi_vec_shift_narrow<string mnemonic, ZPRRegOp ZdRC, RegisterOperand ZSrcOp, +                                    Operand immtype, bits<3> opc, bits<2> tsz> +    : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn, immtype:$imm), +        mnemonic, "\t$Zd, $Zn, $imm",          "", []>, Sched<[]> {    bits<5> Zd;    bits<4> Zn; -  bits<4> imm4; +  bits<4> imm;    let Inst{31-23} = 0b010001011;    let Inst{22}    = tsz{1};    let Inst{21}    = 0b1;    let Inst{20}    = tsz{0}; -  let Inst{19-16} = imm4; +  let Inst{18-16} = imm{2-0};  // imm3    let Inst{15-14} = 0b00;    let Inst{13-11} = opc;    let Inst{10}    = 0b0; @@ -10052,12 +10073,19 @@ class sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, bits<2> tsz>    let hasSideEffects = 0;  } -multiclass sve2p1_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> { -  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, opc, 0b01>; +multiclass sve_multi_vec_shift_narrow<string mnemonic, bits<3> opc, SDPatternOperator intrinsic> { +  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR16, ZZ_s_mul_r, vecshiftR16, opc, 0b01> { +    let Inst{19} = imm{3}; // imm4 +  }    def : SVE2p1_Sat_Shift_VG2_Pat<NAME, intrinsic, nxv8i16, nxv4i32, vecshiftR16>;  } +multiclass sve_multi_vec_round_shift_narrow<string mnemonic, bits<3> opc> { +  def NAME : sve2p1_multi_vec_shift_narrow<mnemonic, ZPR8, ZZ_h_mul_r, vecshiftR8, opc, 0b00> { +    let Inst{19} = 0b1;    // always 1 for imm3 version +  } +}  // SME2 multi-vec contiguous load (scalar plus scalar, two registers)  class sve2p1_mem_cld_ss_2z<string mnemonic, bits<2> msz, bit n, @@ -11164,7 +11192,7 @@ multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> {    def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;  } -// FP8 Look up table +// Look up table  class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty,                              Operand idx_ty, bits<4>opc, string mnemonic>   
   : I<(outs zd_ty:$Zd), (ins zn_ty:$Zn, ZPRAny:$Zm, idx_ty:$idx), @@ -11183,7 +11211,7 @@ class sve2_lut_vector_index<ZPRRegOp zd_ty, RegisterOperand zn_ty,    let Inst{4-0}   = Zd;  } -// FP8 Look up table read with 2-bit indices +// Look up table read with 2-bit indices  multiclass sve2_luti2_vector_index<string mnemonic> {    def _B : sve2_lut_vector_index<ZPR8, Z_b, VectorIndexS32b, {?, 0b100}, mnemonic> {      bits<2> idx; @@ -11205,7 +11233,7 @@ multiclass sve2_luti2_vector_index<string mnemonic> {                           i32, timm32_0_7, !cast<Instruction>(NAME # _H)>;  } -// FP8 Look up table read with 4-bit indices +// Look up table read with 4-bit indices  multiclass sve2_luti4_vector_index<string mnemonic> {    def _B : sve2_lut_vector_index<ZPR8, Z_b, VectorIndexD32b, 0b1001, mnemonic> {      bit idx; @@ -11226,7 +11254,7 @@ multiclass sve2_luti4_vector_index<string mnemonic> {                           i32, timm32_0_3, !cast<Instruction>(NAME # _H)>;  } -// FP8 Look up table read with 4-bit indices (two contiguous registers) +// Look up table read with 4-bit indices (two contiguous registers)  multiclass sve2_luti4_vector_vg2_index<string mnemonic> {    def NAME : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexS32b, {?, 0b101}, mnemonic> {      bits<2> idx; @@ -11250,6 +11278,29 @@ multiclass sve2_luti4_vector_vg2_index<string mnemonic> {                                                  nxv16i8:$Op3, timm32_0_3:$Op4))>;  } +// Look up table read with 6-bit indices +multiclass sve2_luti6_vector_index<string mnemonic> { +  def _H : sve2_lut_vector_index<ZPR16, ZZ_h, VectorIndexD32b, 0b1011, mnemonic> { +    bit idx; +    let Inst{23} = idx; +  } +} + +// Look up table +class sve2_luti6_vector<string mnemonic> +    : I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, ZPRAny:$Zm), +      mnemonic, "\t$Zd, $Zn, $Zm", +      "", []>, Sched<[]> { +  bits<5> Zd; +  bits<5> Zn; +  bits<5> Zm; +  let Inst{31-21} = 0b01000101001; +  let Inst{20-16} = Zm; +  let Inst{15-10} = 
0b101011; +  let Inst{9-5}   = Zn; +  let Inst{4-0}   = Zd; +} +  //===----------------------------------------------------------------------===//  // Checked Pointer Arithmetic (FEAT_CPA)  //===----------------------------------------------------------------------===// @@ -11280,3 +11331,49 @@ class sve_int_mla_cpa<string asm>    let ElementSize = ZPR64.ElementSize;  } + +//===----------------------------------------------------------------------===// +// FP to Int down-converts +//===----------------------------------------------------------------------===// +class sve2_fp_to_int_downcvt<string asm, ZPRRegOp ZdRC, RegisterOperand ZSrcOp, bits<2> size, bit U> +  : I<(outs ZdRC:$Zd), (ins ZSrcOp:$Zn), +      asm, "\t$Zd, $Zn", "", []>, Sched<[]> { +  bits<5> Zd; +  bits<4> Zn; +  let Inst{31-24} = 0b01100101; +  let Inst{23-22} = size; +  let Inst{21-11} = 0b00110100110; +  let Inst{10}    = U; +  let Inst{9-6}   = Zn; +  let Inst{5}     = 0b0; +  let Inst{4-0}   = Zd; +} + +multiclass sve2_fp_to_int_downcvt<string asm, bit U> { +  def _HtoB : sve2_fp_to_int_downcvt<asm, ZPR8,  ZZ_h_mul_r, 0b01, U>; +  def _StoH : sve2_fp_to_int_downcvt<asm, ZPR16, ZZ_s_mul_r, 0b10, U>; +  def _DtoS : sve2_fp_to_int_downcvt<asm, ZPR32, ZZ_d_mul_r, 0b11, U>; +} + +//===----------------------------------------------------------------------===// +// Int to FP up-converts +//===----------------------------------------------------------------------===// +class sve2_int_to_fp_upcvt<string asm, ZPRRegOp ZdRC, ZPRRegOp ZnRC, +                        bits<2> size, bits<2> U> +  : I<(outs ZdRC:$Zd), (ins  ZnRC:$Zn), +      asm, "\t$Zd, $Zn", "", []>, Sched<[]> { +  bits<5> Zd; +  bits<5> Zn; +  let Inst{31-24} = 0b01100101; +  let Inst{23-22} = size; +  let Inst{21-12} = 0b0011000011; +  let Inst{11-10} = U; +  let Inst{9-5}   = Zn; +  let Inst{4-0}   = Zd; +} + +multiclass sve2_int_to_fp_upcvt<string asm, bits<2> U> { +  def _BtoH : sve2_int_to_fp_upcvt<asm, ZPR16, ZPR8,  0b01, U>; +  def 
_HtoS : sve2_int_to_fp_upcvt<asm, ZPR32, ZPR16, 0b10, U>; +  def _StoD : sve2_int_to_fp_upcvt<asm, ZPR64, ZPR32, 0b11, U>; +} diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index d6cb0e8..268a229 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -139,6 +139,13 @@ namespace llvm {  }  namespace llvm { +namespace AArch64CMHPriorityHint { +#define GET_CMHPRIORITYHINT_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64CMHPriorityHint +} // namespace llvm + +namespace llvm {    namespace AArch64SysReg {  #define GET_SysRegsList_IMPL  #include "AArch64GenSystemOperands.inc" @@ -190,6 +197,32 @@ namespace AArch64TLBIP {  #define GET_TLBIPTable_IMPL  #include "AArch64GenSystemOperands.inc"  } // namespace AArch64TLBIP + +namespace AArch64MLBI { +#define GET_MLBITable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64MLBI +} // namespace llvm + +namespace llvm { +namespace AArch64GIC { +#define GET_GICTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GIC +} // namespace llvm + +namespace llvm { +namespace AArch64GICR { +#define GET_GICRTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GICR +} // namespace llvm + +namespace llvm { +namespace AArch64GSB { +#define GET_GSBTable_IMPL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GSB  } // namespace llvm  namespace llvm { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index fea33ef..27812e9 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -409,6 +409,16 @@ struct SysAliasReg : SysAlias {        : SysAlias(N, E, F), NeedsReg(R) {}  }; +struct SysAliasOptionalReg : SysAlias { +  bool NeedsReg; +  bool OptionalReg; +  constexpr SysAliasOptionalReg(const char 
*N, uint16_t E, bool R, bool O) +      : SysAlias(N, E), NeedsReg(R), OptionalReg(O) {} +  constexpr SysAliasOptionalReg(const char *N, uint16_t E, bool R, bool O, +                                FeatureBitset F) +      : SysAlias(N, E, F), NeedsReg(R), OptionalReg(O) {} +}; +  struct SysAliasImm : SysAlias {    uint16_t ImmValue;    constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I) @@ -677,6 +687,14 @@ namespace AArch64BTIHint {  #include "AArch64GenSystemOperands.inc"  } +namespace AArch64CMHPriorityHint { +struct CMHPriorityHint : SysAlias { +  using SysAlias::SysAlias; +}; +#define GET_CMHPRIORITYHINT_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64CMHPriorityHint +  namespace AArch64SME {  enum ToggleCondition : unsigned {    Always, @@ -788,21 +806,53 @@ namespace AArch64SysReg {  }  namespace AArch64TLBI { -  struct TLBI : SysAliasReg { -    using SysAliasReg::SysAliasReg; -  }; -  #define GET_TLBITable_DECL -  #include "AArch64GenSystemOperands.inc" +struct TLBI : SysAliasOptionalReg { +  using SysAliasOptionalReg::SysAliasOptionalReg; +}; +#define GET_TLBITable_DECL +#include "AArch64GenSystemOperands.inc"  }  namespace AArch64TLBIP { -struct TLBIP : SysAliasReg { -  using SysAliasReg::SysAliasReg; +struct TLBIP : SysAliasOptionalReg { +  using SysAliasOptionalReg::SysAliasOptionalReg;  };  #define GET_TLBIPTable_DECL  #include "AArch64GenSystemOperands.inc"  } // namespace AArch64TLBIP +namespace AArch64MLBI { +struct MLBI : SysAliasReg { +  using SysAliasReg::SysAliasReg; +}; +#define GET_MLBITable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64MLBI + +namespace AArch64GIC { +struct GIC : SysAliasReg { +  using SysAliasReg::SysAliasReg; +}; +#define GET_GICTable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GIC + +namespace AArch64GICR { +struct GICR : SysAliasReg { +  using SysAliasReg::SysAliasReg; +}; +#define GET_GICRTable_DECL +#include "AArch64GenSystemOperands.inc" +} // 
namespace AArch64GICR + +namespace AArch64GSB { +struct GSB : SysAlias { +  using SysAlias::SysAlias; +}; +#define GET_GSBTable_DECL +#include "AArch64GenSystemOperands.inc" +} // namespace AArch64GSB +  namespace AArch64II {  /// Target Operand Flag enum.  enum TOF { diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index d71f728..085c8588 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -75,8 +75,8 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {  }  void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, -                                     const AArch64TargetLowering &TLI) { -  RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); +                                     const RTLIB::RuntimeLibcallsInfo &RTLCI) { +  RTLIB::LibcallImpl Impl = RTLCI.getSupportedLibcallImpl(FuncName);    if (Impl == RTLIB::Unsupported)      return;    unsigned KnownAttrs = SMEAttrs::Normal; @@ -124,21 +124,22 @@ bool SMECallAttrs::requiresSMChange() const {    return true;  } -SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) +SMECallAttrs::SMECallAttrs(const CallBase &CB, +                           const RTLIB::RuntimeLibcallsInfo *RTLCI)      : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),        Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {    if (auto *CalledFunction = CB.getCalledFunction()) -    CalledFn = SMEAttrs(*CalledFunction, TLI); - -  // An `invoke` of an agnostic ZA function may not return normally (it may -  // resume in an exception block). In this case, it acts like a private ZA -  // callee and may require a ZA save to be set up before it is called. 
-  if (isa<InvokeInst>(CB)) -    CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false); +    CalledFn = SMEAttrs(*CalledFunction, RTLCI);    // FIXME: We probably should not allow SME attributes on direct calls but    // clang duplicates streaming mode attributes at each callsite.    assert((IsIndirect ||            ((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) &&           "SME attributes at callsite do not match declaration"); + +  // An `invoke` of an agnostic ZA function may not return normally (it may +  // resume in an exception block). In this case, it acts like a private ZA +  // callee and may require a ZA save to be set up before it is called. +  if (isa<InvokeInst>(CB)) +    CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);  } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index d26e3cd..28c397e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -12,8 +12,9 @@  #include "llvm/IR/Function.h"  namespace llvm { - -class AArch64TargetLowering; +namespace RTLIB { +struct RuntimeLibcallsInfo; +}  class Function;  class CallBase; @@ -52,14 +53,14 @@ public:    SMEAttrs() = default;    SMEAttrs(unsigned Mask) { set(Mask); } -  SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) +  SMEAttrs(const Function &F, const RTLIB::RuntimeLibcallsInfo *RTLCI = nullptr)        : SMEAttrs(F.getAttributes()) { -    if (TLI) -      addKnownFunctionAttrs(F.getName(), *TLI); +    if (RTLCI) +      addKnownFunctionAttrs(F.getName(), *RTLCI);    }    SMEAttrs(const AttributeList &L); -  SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { -    addKnownFunctionAttrs(FuncName, TLI); +  SMEAttrs(StringRef FuncName, const RTLIB::RuntimeLibcallsInfo &RTLCI) { +    addKnownFunctionAttrs(FuncName, RTLCI);    };    void set(unsigned M, bool Enable = true) { @@ -157,7 +158,7 
@@ public:  private:    void addKnownFunctionAttrs(StringRef FuncName, -                             const AArch64TargetLowering &TLI); +                             const RTLIB::RuntimeLibcallsInfo &RTLCI);    void validate() const;  }; @@ -175,7 +176,7 @@ public:                 SMEAttrs Callsite = SMEAttrs::Normal)        : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} -  SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); +  SMECallAttrs(const CallBase &CB, const RTLIB::RuntimeLibcallsInfo *RTLCI);    SMEAttrs &caller() { return CallerFn; }    SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[    combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask  ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), +  [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), +   (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< +  (defs root:$dst), +  (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), +  (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), +         (G_OR $or, $x_lo, $y), +         (G_MERGE_VALUES $dst, $or, $x_hi))>; +  let Predicates = [Has16BitInsts, NotHasMed3_16] in {  // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This  // saves one instruction compared to the promotion. 
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<    "AMDGPUPreLegalizerCombinerImpl",    [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,     foldable_fneg, combine_shuffle_vector_to_build_vector, -   binop_s64_with_s32_mask_combines]> { +   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {    let CombineAllMethodName = "tryCombineAllImpl";  } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<    [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,     uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,     rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, -   binop_s64_with_s32_mask_combines]> { +   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {    let CombineAllMethodName = "tryCombineAllImpl";  } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8ed4062..1b559a6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,        MVT::i64, Custom);    setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); -  setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, -                     MVT::i32, Legal); +  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, +                     Legal);    setOperationAction(        {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..1a13b22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,      FPOpActions.clampMaxNumElementsStrict(0, S32, 2);    } +  auto 
&MinNumMaxNumIeee = +      getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + +  if (ST.hasVOP3PInsts()) { +    MinNumMaxNumIeee.legalFor(FPTypesPK16) +        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) +        .clampMaxNumElements(0, S16, 2) +        .clampScalar(0, S16, S64) +        .scalarize(0); +  } else if (ST.has16BitInsts()) { +    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); +  } else { +    MinNumMaxNumIeee.legalFor(FPTypesBase) +        .clampScalar(0, S32, S64) +        .scalarize(0); +  } +    auto &MinNumMaxNum = getActionDefinitionsBuilder( -      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, -       G_FMAXNUM_IEEE}); +      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});    if (ST.hasVOP3PInsts()) {      MinNumMaxNum.customFor(FPTypesPK16) @@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,          .legalFor(FPTypesPK16)          .clampMaxNumElements(0, S16, 2)          .scalarize(0); +  } else if (ST.hasVOP3PInsts()) { +    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) +        .lowerFor({V2S16}) +        .clampMaxNumElementsStrict(0, S16, 2) +        .scalarize(0) +        .lower();    } else { -    // TODO: Implement -    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); +    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) +        .scalarize(0) +        .clampScalar(0, S32, S64) +        .lower();    }    getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(    case TargetOpcode::G_FMAXNUM:    case TargetOpcode::G_FMINIMUMNUM:    case TargetOpcode::G_FMAXIMUMNUM: -  case TargetOpcode::G_FMINNUM_IEEE: -  case TargetOpcode::G_FMAXNUM_IEEE:      return legalizeMinNumMaxNum(Helper, MI);    case TargetOpcode::G_EXTRACT_VECTOR_ELT:      return legalizeExtractVectorElt(MI, MRI, B); @@ -2817,23 +2839,8 @@ bool 
AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,    MachineFunction &MF = Helper.MIRBuilder.getMF();    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); -  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || -                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - -  // With ieee_mode disabled, the instructions have the correct behavior -  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. -  // -  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode -  // enabled. -  if (!MFI->getMode().IEEE) { -    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || -        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) -      return true; - -    return !IsIEEEOp; -  } - -  if (IsIEEEOp) +  // With ieee_mode disabled, the instructions have the correct behavior. +  if (!MFI->getMode().IEEE)      return true;    return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 99ba043..5580e4c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1860,7 +1860,6 @@ private:    bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,                                const unsigned CPol);    bool validateTFE(const MCInst &Inst, const OperandVector &Operands); -  bool validateSetVgprMSB(const MCInst &Inst, const OperandVector &Operands);    bool validateLdsDirect(const MCInst &Inst, const OperandVector &Operands);    bool validateWMMA(const MCInst &Inst, const OperandVector &Operands);    unsigned getConstantBusLimit(unsigned Opcode) const; @@ -5506,22 +5505,6 @@ bool AMDGPUAsmParser::validateTFE(const MCInst &Inst,    return true;  } -bool AMDGPUAsmParser::validateSetVgprMSB(const MCInst &Inst, -                                         const OperandVector &Operands) { -  if (Inst.getOpcode() != 
AMDGPU::S_SET_VGPR_MSB_gfx12) -    return true; - -  int Simm16Pos = -      AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::simm16); -  if ((unsigned)Inst.getOperand(Simm16Pos).getImm() > 255) { -    SMLoc Loc = Operands[1]->getStartLoc(); -    Error(Loc, "s_set_vgpr_msb accepts values in range [0..255]"); -    return false; -  } - -  return true; -} -  bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst,                                     const OperandVector &Operands) {    unsigned Opc = Inst.getOpcode(); @@ -5681,9 +5664,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc,    if (!validateTFE(Inst, Operands)) {      return false;    } -  if (!validateSetVgprMSB(Inst, Operands)) { -    return false; -  }    if (!validateWMMA(Inst, Operands)) {      return false;    } diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 09ef6ac..2aa54c9 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -45,9 +45,6 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,    // Legalize loads and stores to the private address space.    setOperationAction(ISD::LOAD, {MVT::i32, MVT::v2i32, MVT::v4i32}, Custom); -  // 32-bit ABS is legal for AMDGPU except for R600 -  setOperationAction(ISD::ABS, MVT::i32, Expand); -    // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address    // spaces, so it is custom lowered to handle those where it isn't.    
for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a757421..be42291 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -298,7 +298,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setOperationAction(ISD::BR_CC,                       {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); -  setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); +  setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);    setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index ee10190..05ba76a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -976,10 +976,10 @@ def : GCNPat <  } // End SubtargetPredicate = HasLshlAddU64Inst  let SubtargetPredicate = HasAddMinMaxInsts in { -def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>; -def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>; -def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>; -def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>; +def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>; +def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;  }  def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index c4692b7..4ae2c1e 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,  >;  let SubtargetPredicate = HasPkAddMinMaxInsts in { -def : ThreeOp_OpSelClampPats<add, 
smax, V_PK_ADD_MAX_I16>; -def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>; -def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>; -def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>; +def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>; +def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;  }  let SubtargetPredicate = HasPkMinMax3Insts in { diff --git a/llvm/lib/Target/ARM/ARMArchitectures.td b/llvm/lib/Target/ARM/ARMArchitectures.td index 301ed5b..bfcecfe 100644 --- a/llvm/lib/Target/ARM/ARMArchitectures.td +++ b/llvm/lib/Target/ARM/ARMArchitectures.td @@ -297,6 +297,18 @@ def ARMv96a   : Architecture<"armv9.6-a", "ARMv96a",  [HasV9_6aOps,                                                         FeatureCRC,                                                         FeatureRAS,                                                         FeatureDotProd]>; +def ARMv97a   : Architecture<"armv9.7-a", "ARMv97a",  [HasV9_7aOps, +                                                       FeatureAClass, +                                                       FeatureDB, +                                                       FeatureFPARMv8, +                                                       FeatureNEON, +                                                       FeatureDSP, +                                                       FeatureTrustZone, +                                                       FeatureMP, +                                                       FeatureVirtualization, +                                                       FeatureCRC, +                                                       FeatureRAS, +                                                       FeatureDotProd]>;  def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,                                                         
FeatureRClass,                                                         FeatureDB, diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 9b1fa5d..e562b21 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -712,6 +712,11 @@ def HasV9_6aOps   : SubtargetFeature<"v9.6a", "HasV9_6aOps", "true",                                     "Support ARM v9.6a instructions",                                     [HasV9_5aOps]>; +// Armv9.7-A is a v9-only architecture. +def HasV9_7aOps   : SubtargetFeature<"v9.7a", "HasV9_7aOps", "true", +                                   "Support ARM v9.7a instructions", +                                   [HasV9_6aOps]>; +  def HasV8_1MMainlineOps : SubtargetFeature<                 "v8.1m.main", "HasV8_1MMainlineOps", "true",                 "Support ARM v8-1M Mainline instructions", diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8122db2..313ae3d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21381,15 +21381,6 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  RTLIB::LibcallImpl SecurityCheckCookie = -      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); -  if (SecurityCheckCookie != RTLIB::Unsupported) -    return M.getFunction(getLibcallImplName(SecurityCheckCookie)); -  return TargetLowering::getSSPStackGuardCheck(M); -} -  bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,                                                    unsigned &Cost) const {    // If we do not have NEON, vector types are not natively supported. 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8c5e0cf..357d2c5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -708,7 +708,6 @@ class VectorType;      bool useLoadStackGuardNode(const Module &M) const override;      void insertSSPDeclarations(Module &M) const override; -    Function *getSSPStackGuardCheck(const Module &M) const override;      bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,                                     unsigned &Cost) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 53be167..10d4cd5 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6546,23 +6546,25 @@ def KCFI_CHECK_ARM      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsARM]> { -  let Size = 28; // 7 instructions (bic, ldr, 4x eor, beq, udf) +  let Size = 40; // worst-case 10 instructions @ 4 bytes each +                 // (push, bic, ldr, 4x eor, pop, beq, udf)  }  def KCFI_CHECK_Thumb2      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsThumb2]> { -  let Size = -      32; // worst-case 9 instructions (push, bic, ldr, 4x eor, pop, beq.w, udf) +  let Size = 34; // worst-case (push.w[2], bic[4], ldr[4], 4x eor[16], pop.w[2], +                 // beq.w[4], udf[2])  }  def KCFI_CHECK_Thumb1      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsThumb1Only]> { -  let Size = 50; // worst-case 25 instructions (pushes, bic helper, type -                 // building, cmp, pops) +  let Size = 38; // worst-case 19 instructions @ 2 bytes each +                 // (2x push, 3x bic-helper, subs+ldr, 13x type-building, cmp, +                 // 2x pop, beq, bkpt)  }  
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 0796746..94b511a 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -895,6 +895,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {    case ARM::ArchKind::ARMV9_4A:    case ARM::ArchKind::ARMV9_5A:    case ARM::ArchKind::ARMV9_6A: +  case ARM::ArchKind::ARMV9_7A:      S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);      S.setAttributeItem(ARM_ISA_use, Allowed, false);      S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 02fb905..4a2f714 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -1504,14 +1504,26 @@ let Defs = [SREG], hasSideEffects = 0 in  def FRMIDX : Pseudo<(outs DLDREGS:$dst), (ins DLDREGS:$src, i16imm:$src2),                      "frmidx\t$dst, $src, $src2", []>; +// The instructions STDSPQRr and STDWSPQRr are used to store to the stack +// frame. The most accurate implementation would be to load the SP into +// a temporary pointer variable and then STDPtrQRr. However for efficiency, +// we assume that R29R28 contains the current call frame pointer. +// However in the PEI pass we sometimes rewrite a ADJCALLSTACKDOWN pseudo, +// plus one or more STDSPQRr/STDWSPQRr pseudo instructions to use Z for a +// stack adjustment then as a base pointer. To avoid corruption, we thus +// specify special classes of registers, like GPR8 and DREGS, but with +// the Z register removed, as the source/input to these instructions.  // This pseudo is either converted to a regular store or a push which clobbers  // SP. 
-def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8:$src), +let Defs = [SP], Uses = [SP], hasSideEffects = 0 in +def STDSPQRr : StorePseudo<(outs), (ins memspi:$dst, GPR8NOZ:$src),                             "stdstk\t$dst, $src", [(store i8:$src, addr:$dst)]>; +// See the comment on STDSPQRr.  // This pseudo is either converted to a regular store or a push which clobbers  // SP. -def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGS:$src), +let Defs = [SP], Uses = [SP], hasSideEffects = 0 in +def STDWSPQRr : StorePseudo<(outs), (ins memspi:$dt, DREGSNOZ:$src),                              "stdwstk\t$dt, $src", [(store i16:$src, addr:$dt)]>;  // SP read/write pseudos. diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td index 182f92c..9b935b1 100644 --- a/llvm/lib/Target/AVR/AVRRegisterInfo.td +++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -211,6 +211,31 @@ def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, (add R31R30, R29R28), ptr>;  // model this using a register class containing only the Z register.  def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>; +// general registers excluding Z register lo/hi, these are the only +// registers that are always safe for STDSPQr instructions +def GPR8NOZ : RegisterClass<"AVR", [i8], 8, +                         (// Return value and argument registers. +                          add R24, R25, R18, R19, R20, R21, R22, R23, +                          // Scratch registers. +                          R26, R27, +                          // Callee saved registers. +                          R28, R29, R17, R16, R15, R14, R13, R12, R11, R10, +                          R9, R8, R7, R6, R5, R4, R3, R2, R0, R1)>; + +// 16-bit pair register class excluding Z register lo/hi, these are the only +// registers that are always safe for STDWSPQr instructions +def DREGSNOZ : RegisterClass<"AVR", [i16], 8, +                          (// Return value and arguments. 
+                           add R25R24, R19R18, R21R20, R23R22, +                           // Scratch registers. +                           R27R26, +                           // Callee saved registers. +                           R29R28, R17R16, R15R14, R13R12, R11R10, R9R8, +                           R7R6, R5R4, R3R2, R1R0, +                           // Pseudo regs for unaligned 16-bits +                           R26R25, R24R23, R22R21, R20R19, R18R17, R16R15, +                           R14R13, R12R11, R10R9)>; +  // Register class used for the stack read pseudo instruction.  def GPRSP : RegisterClass<"AVR", [i16], 8, (add SP)>; diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 42e90f0..d6fa65f 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -6,7 +6,7 @@  //  //===----------------------------------------------------------------------===//  /// -/// \file This file contains pases and utilities to convert a modern LLVM +/// \file This file contains passes and utilities to convert a modern LLVM  /// module into a module compatible with the LLVM 3.7-based DirectX Intermediate  /// Language (DXIL).  
//===----------------------------------------------------------------------===// @@ -16,7 +16,6 @@  #include "DirectX.h"  #include "DirectXIRPasses/PointerTypeAnalysis.h"  #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/StringSet.h"  #include "llvm/Analysis/DXILMetadataAnalysis.h"  #include "llvm/Analysis/DXILResource.h" @@ -27,7 +26,6 @@  #include "llvm/IR/Module.h"  #include "llvm/InitializePasses.h"  #include "llvm/Pass.h" -#include "llvm/Support/Compiler.h"  #include "llvm/Support/VersionTuple.h"  #define DEBUG_TYPE "dxil-prepare" @@ -116,31 +114,6 @@ static void removeStringFunctionAttributes(Function &F,    F.removeRetAttrs(DeadAttrs);  } -static void cleanModuleFlags(Module &M) { -  NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); -  if (!MDFlags) -    return; - -  SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; -  M.getModuleFlagsMetadata(FlagEntries); -  bool Updated = false; -  for (auto &Flag : FlagEntries) { -    // llvm 3.7 only supports behavior up to AppendUnique. 
-    if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) -      continue; -    Flag.Behavior = Module::ModFlagBehavior::Warning; -    Updated = true; -  } - -  if (!Updated) -    return; - -  MDFlags->eraseFromParent(); - -  for (auto &Flag : FlagEntries) -    M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); -} -  class DXILPrepareModule : public ModulePass {    static Value *maybeGenerateBitcast(IRBuilder<> &Builder, @@ -202,15 +175,6 @@ class DXILPrepareModule : public ModulePass {                           Builder.getPtrTy(PtrTy->getAddressSpace())));    } -  static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { -    return {M.getMDKindID("dx.nonuniform"), -            M.getMDKindID("dx.controlflow.hints"), -            M.getMDKindID("dx.precise"), -            llvm::LLVMContext::MD_range, -            llvm::LLVMContext::MD_alias_scope, -            llvm::LLVMContext::MD_noalias}; -  } -  public:    bool runOnModule(Module &M) override {      PointerTypeMap PointerTypes = PointerTypeAnalysis::run(M); @@ -224,10 +188,7 @@ public:      const dxil::ModuleMetadataInfo MetadataInfo =          getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();      VersionTuple ValVer = MetadataInfo.ValidatorVersion; -    bool SkipValidation = ValVer.getMajor() == 0 && ValVer.getMinor() == 0; - -    // construct allowlist of valid metadata node kinds -    std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); +    bool AllowExperimental = ValVer.getMajor() == 0 && ValVer.getMinor() == 0;      for (auto &F : M.functions()) {        F.removeFnAttrs(AttrMask); @@ -235,7 +196,7 @@ public:        // Only remove string attributes if we are not skipping validation.        // This will reserve the experimental attributes when validation version        // is 0.0 for experiment mode. 
-      removeStringFunctionAttributes(F, SkipValidation); +      removeStringFunctionAttributes(F, AllowExperimental);        for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)          F.removeParamAttrs(Idx, AttrMask); @@ -243,11 +204,17 @@ public:          IRBuilder<> Builder(&BB);          for (auto &I : make_early_inc_range(BB)) { -          I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); +          if (auto *CB = dyn_cast<CallBase>(&I)) { +            CB->removeFnAttrs(AttrMask); +            CB->removeRetAttrs(AttrMask); +            for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) +              CB->removeParamAttrs(Idx, AttrMask); +            continue; +          }            // Emtting NoOp bitcast instructions allows the ValueEnumerator to be            // unmodified as it reserves instruction IDs during contruction. -          if (auto LI = dyn_cast<LoadInst>(&I)) { +          if (auto *LI = dyn_cast<LoadInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, LI->getPointerOperand(),                      LI->getType())) { @@ -257,7 +224,7 @@ public:              }              continue;            } -          if (auto SI = dyn_cast<StoreInst>(&I)) { +          if (auto *SI = dyn_cast<StoreInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, SI->getPointerOperand(),                      SI->getValueOperand()->getType())) { @@ -268,39 +235,16 @@ public:              }              continue;            } -          if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) { +          if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, GEP->getPointerOperand(),                      GEP->getSourceElementType()))                GEP->setOperand(0, NoOpBitcast);              continue;            } -          if 
(auto *CB = dyn_cast<CallBase>(&I)) { -            CB->removeFnAttrs(AttrMask); -            CB->removeRetAttrs(AttrMask); -            for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) -              CB->removeParamAttrs(Idx, AttrMask); -            continue; -          }          }        }      } -    // Remove flags not for DXIL. -    cleanModuleFlags(M); - -    // dx.rootsignatures will have been parsed from its metadata form as its -    // binary form as part of the RootSignatureAnalysisWrapper, so safely -    // remove it as it is not recognized in DXIL -    if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) -      RootSignature->eraseFromParent(); - -    // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and -    // causes all tests using the DXIL Validator to fail. -    // -    // This is a temporary fix and should be replaced with a whitelist once -    // we have determined all metadata that the DXIL Validator allows -    if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) -      ErrNo->eraseFromParent();      return true;    } @@ -308,11 +252,11 @@ public:    DXILPrepareModule() : ModulePass(ID) {}    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<DXILMetadataAnalysisWrapperPass>(); -    AU.addRequired<RootSignatureAnalysisWrapper>(); -    AU.addPreserved<RootSignatureAnalysisWrapper>(); -    AU.addPreserved<ShaderFlagsAnalysisWrapper>(); +      AU.addPreserved<DXILMetadataAnalysisWrapperPass>();      AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addPreserved<RootSignatureAnalysisWrapper>(); +    AU.addPreserved<ShaderFlagsAnalysisWrapper>();    }    static char ID; // Pass identification.  
}; @@ -323,7 +267,6 @@ char DXILPrepareModule::ID = 0;  INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module",                        false, false)  INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)  INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false,                      false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 9eebcc9..1e4797b 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -7,8 +7,10 @@  //===----------------------------------------------------------------------===//  #include "DXILTranslateMetadata.h" +#include "DXILRootSignature.h"  #include "DXILShaderFlags.h"  #include "DirectX.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Twine.h"  #include "llvm/Analysis/DXILMetadataAnalysis.h" @@ -204,9 +206,9 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,    return MDNode::get(Ctx, MDVals);  } -MDTuple *constructEntryMetadata(const Function *EntryFn, MDTuple *Signatures, -                                MDNode *Resources, MDTuple *Properties, -                                LLVMContext &Ctx) { +static MDTuple *constructEntryMetadata(const Function *EntryFn, +                                       MDTuple *Signatures, MDNode *Resources, +                                       MDTuple *Properties, LLVMContext &Ctx) {    // Each entry point metadata record specifies:    //  * reference to the entry point function global symbol    //  * unmangled name @@ -290,42 +292,82 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD,    return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx);  } -// TODO: We might need to refactor this to be more generic, -// in case we need more metadata to be replaced. 
-static void translateBranchMetadata(Module &M) { -  for (Function &F : M) { -    for (BasicBlock &BB : F) { -      Instruction *BBTerminatorInst = BB.getTerminator(); +static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { +  MDNode *HlslControlFlowMD = +      BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + +  if (!HlslControlFlowMD) +    return; -      MDNode *HlslControlFlowMD = -          BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); +  assert(HlslControlFlowMD->getNumOperands() == 2 && +         "invalid operands for hlsl.controlflow.hint"); -      if (!HlslControlFlowMD) -        continue; +  MDBuilder MDHelper(M.getContext()); -      assert(HlslControlFlowMD->getNumOperands() == 2 && -             "invalid operands for hlsl.controlflow.hint"); +  llvm::Metadata *HintsStr = MDHelper.createString("dx.controlflow.hints"); +  llvm::Metadata *HintsValue = MDHelper.createConstant( +      mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1))); -      MDBuilder MDHelper(M.getContext()); -      ConstantInt *Op1 = -          mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); +  MDNode *MDNode = llvm::MDNode::get(M.getContext(), {HintsStr, HintsValue}); -      SmallVector<llvm::Metadata *, 2> Vals( -          ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), -                               MDHelper.createConstant(Op1)}); +  BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); +  BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); +} + +static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { +  return { +      M.getMDKindID("dx.nonuniform"),    M.getMDKindID("dx.controlflow.hints"), +      M.getMDKindID("dx.precise"),       llvm::LLVMContext::MD_range, +      llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; +} -      MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); +static void translateInstructionMetadata(Module 
&M) { +  // construct allowlist of valid metadata node kinds +  std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); -      BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); -      BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); +  for (Function &F : M) { +    for (BasicBlock &BB : F) { +      // This needs to be done first so that "hlsl.controlflow.hints" isn't +      // removed in the whitelist below +      if (auto *I = BB.getTerminator()) +        translateBranchMetadata(M, I); + +      for (auto &I : make_early_inc_range(BB)) { +        I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); +      }      }    }  } -static void translateMetadata(Module &M, DXILResourceMap &DRM, -                              DXILResourceTypeMap &DRTM, -                              const ModuleShaderFlags &ShaderFlags, -                              const ModuleMetadataInfo &MMDI) { +static void cleanModuleFlags(Module &M) { +  NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); +  if (!MDFlags) +    return; + +  SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; +  M.getModuleFlagsMetadata(FlagEntries); +  bool Updated = false; +  for (auto &Flag : FlagEntries) { +    // llvm 3.7 only supports behavior up to AppendUnique. 
+    if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) +      continue; +    Flag.Behavior = Module::ModFlagBehavior::Warning; +    Updated = true; +  } + +  if (!Updated) +    return; + +  MDFlags->eraseFromParent(); + +  for (auto &Flag : FlagEntries) +    M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); +} + +static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, +                                    DXILResourceTypeMap &DRTM, +                                    const ModuleShaderFlags &ShaderFlags, +                                    const ModuleMetadataInfo &MMDI) {    LLVMContext &Ctx = M.getContext();    IRBuilder<> IRB(Ctx);    SmallVector<MDNode *> EntryFnMDNodes; @@ -381,6 +423,22 @@ static void translateMetadata(Module &M, DXILResourceMap &DRM,        M.getOrInsertNamedMetadata("dx.entryPoints");    for (auto *Entry : EntryFnMDNodes)      EntryPointsNamedMD->addOperand(Entry); + +  cleanModuleFlags(M); + +  // dx.rootsignatures will have been parsed from its metadata form as its +  // binary form as part of the RootSignatureAnalysisWrapper, so safely +  // remove it as it is not recognized in DXIL +  if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) +    RootSignature->eraseFromParent(); + +  // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and +  // causes all tests using the DXIL Validator to fail. 
+  // +  // This is a temporary fix and should be replaced with a allowlist once +  // we have determined all metadata that the DXIL Validator allows +  if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) +    ErrNo->eraseFromParent();  }  PreservedAnalyses DXILTranslateMetadata::run(Module &M, @@ -390,8 +448,8 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,    const ModuleShaderFlags &ShaderFlags = MAM.getResult<ShaderFlagsAnalysis>(M);    const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); -  translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); -  translateBranchMetadata(M); +  translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); +  translateInstructionMetadata(M);    return PreservedAnalyses::all();  } @@ -409,10 +467,13 @@ public:      AU.addRequired<DXILResourceWrapperPass>();      AU.addRequired<ShaderFlagsAnalysisWrapper>();      AU.addRequired<DXILMetadataAnalysisWrapperPass>(); -    AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addRequired<RootSignatureAnalysisWrapper>(); +      AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); -    AU.addPreserved<ShaderFlagsAnalysisWrapper>();      AU.addPreserved<DXILResourceBindingWrapperPass>(); +    AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addPreserved<RootSignatureAnalysisWrapper>(); +    AU.addPreserved<ShaderFlagsAnalysisWrapper>();    }    bool runOnModule(Module &M) override { @@ -425,8 +486,8 @@ public:      dxil::ModuleMetadataInfo MMDI =          getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); -    translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); -    translateBranchMetadata(M); +    translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); +    translateInstructionMetadata(M);      return true;    }  }; @@ -443,6 +504,7 @@ INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata",                        "DXIL Translate Metadata", false, false)  INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)  
INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)  INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)  INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata",                      "DXIL Translate Metadata", false, false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index f3f5eb1..4c1ffac 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -13,7 +13,8 @@  namespace llvm { -/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes +/// A pass that transforms LLVM Metadata in the module to it's DXIL equivalent, +/// then emits all recognized DXIL Metadata  class DXILTranslateMetadata : public PassInfoMixin<DXILTranslateMetadata> {  public:    PreservedAnalyses run(Module &M, ModuleAnalysisManager &); diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index fb0928b8..ede8463 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -79,6 +79,12 @@ def ExtensionHVXV79: SubtargetFeature<"hvxv79", "HexagonHVXVersion",         ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,         ExtensionHVXV73, ExtensionHVXV75]>; +def ExtensionHVXV81: SubtargetFeature<"hvxv81", "HexagonHVXVersion", +      "Hexagon::ArchEnum::V81", "Hexagon HVX instructions", +      [ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67, +       ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, +       ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79]>; +  def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",        "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;  def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps", @@ -151,6 +157,8 @@ def UseHVXV75          : Predicate<"HST->useHVXV75Ops()">,                           
AssemblerPredicate<(all_of ExtensionHVXV75)>;  def UseHVXV79          : Predicate<"HST->useHVXV79Ops()">,                           AssemblerPredicate<(all_of ExtensionHVXV79)>; +def UseHVXV81          : Predicate<"HST->useHVXV81Ops()">, +                         AssemblerPredicate<(all_of ExtensionHVXV81)>;  def UseAudio           : Predicate<"HST->useAudioOps()">,                           AssemblerPredicate<(all_of ExtensionAudio)>;  def UseZReg            : Predicate<"HST->useZRegOps()">, @@ -488,6 +496,11 @@ def : Proc<"hexagonv79", HexagonModelV79,             ArchV68, ArchV69, ArchV71, ArchV73, ArchV75, ArchV79,             FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,             FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; +def : Proc<"hexagonv81", HexagonModelV81, +           [ArchV65, ArchV66, ArchV67, ArchV68, ArchV69, ArchV71, ArchV73, +            ArchV75, ArchV79, ArchV81, +            FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, +            FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;  // Need to update the correct features for tiny core.  
// Disable NewValueJumps since the packetizer is unable to handle a packet with diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 8984534..9bf4034 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -29,7 +29,8 @@ enum class ArchEnum {    V71,    V73,    V75, -  V79 +  V79, +  V81  };  inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) { @@ -50,6 +51,7 @@ inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {        .Case("hexagonv73", Hexagon::ArchEnum::V73)        .Case("hexagonv75", Hexagon::ArchEnum::V75)        .Case("hexagonv79", Hexagon::ArchEnum::V79) +      .Case("hexagonv81", Hexagon::ArchEnum::V81)        .Default(std::nullopt);  }  } // namespace Hexagon diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td index 8ec1d93..f623fd0 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.td +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td @@ -34,3 +34,5 @@ def ArchV75: SubtargetFeature<"v75", "HexagonArchVersion", "Hexagon::ArchEnum::V  def HasV75 : Predicate<"HST->hasV75Ops()">, AssemblerPredicate<(all_of ArchV75)>;  def ArchV79: SubtargetFeature<"v79", "HexagonArchVersion", "Hexagon::ArchEnum::V79", "Enable Hexagon V79 architecture">;  def HasV79 : Predicate<"HST->hasV79Ops()">, AssemblerPredicate<(all_of ArchV79)>; +def ArchV81: SubtargetFeature<"v81", "HexagonArchVersion", "Hexagon::ArchEnum::V81", "Enable Hexagon V81 architecture">; +def HasV81 : Predicate<"HST->hasV81Ops()">, AssemblerPredicate<(all_of ArchV81)>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index 93696e0..f4e36fa7 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -7222,3 +7222,595 @@ class DepHVXItinV79 {        [Hex_FWD, Hex_FWD, HVX_FWD]>    ];  } + +class DepHVXItinV81 { +  list<InstrItinData> 
DepHVXItinV81_list = [ +    InstrItinData <tc_0390c1ca, /*SLOT01,LOAD,VA,VX_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [], +      []>, + +    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, 
CVI_MPY1], 0>, +       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], +      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_227864f7, /*SLOT0,STORE,VA,VX_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], +      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ +    
  [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], +      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_37820f4c, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], +      [HVX_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + 
+    InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_4942646a, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_531b383c, /*SLOT0123*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, 
HVX_FWD]>, + +    InstrItinData <tc_540c3da3, /*SLOT0,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], +      [Hex_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_56e64202, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_649072c2, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, 
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_7177e272, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], +      [HVX_FWD]>, + +    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_72e2b393, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       
InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_73efe966, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_7417e785, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [3, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_8772086c, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/ +      [InstrStage<1, [SLOT2], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], +    
  [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_946013d8, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_9a1cab75, /*SLOT01,LOAD,VA,VX_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9aff7a2a, /*SLOT0,STORE,VA,VX_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a19b9305, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, 
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], +      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       
InstrStage<1, [CVI_ALL]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c127de3a, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_c4edf264, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, +       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_cda936da, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_dcca380f, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_e2fdd6e6, /*SLOT0123*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [3], +      [HVX_FWD]>, + +    InstrItinData <tc_e675c45a, 
/*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_f175e046, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ +      [InstrStage<1, [SLOT2], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]> +  ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td index 7a1ad3e..48b665c 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -13740,3 +13740,891 @@ class DepScalarItinV79 {        [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>    ];  } + +class DepScalarItinV81 { +  list<InstrItinData> DepScalarItinV81_list = [ +    InstrItinData <tc_011e0e9d, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_01d44cb2, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_01e1be3b, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_02fe1c65, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0655b949, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_075c8dd8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0a195f2c, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0a43be35, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_0a6c20ae, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0ba0d5da, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_0dfac0a7, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0fac1eb8, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 3], +      
[Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_112d30d6, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_1242dc2a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_1248597c, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_139ef484, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_14ab4f41, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_151bf368, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_158aa3f7, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_197dce51, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1981450d, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_1c2c7a4a, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1c7522a8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1d41f8b7, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1fcb8495, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1fe4ab69, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_20131976, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_2237d952, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_23708a21, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], +      []>, + +    InstrItinData <tc_2471c1c8, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_24e109c7, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_24f426ab, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_27106296, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_280f7fe1, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_28e55c6f, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c13e7f5, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c3e17fc, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_2f573607, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_33e7e673, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_362b0be2, /*tc_3*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_38382228, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_388f9897, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_38e0bae9, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_3d14a17b, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3edca78f, /*tc_2*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3fbf1042, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_407e96f9, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_40d64c94, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4222e6bf, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_42ff66ba, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_442395f3, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_449acf79, /*tc_latepredstaia*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_44d5a428, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_44fffc58, /*tc_3*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_45791fb8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_45f9d1be, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_46c18ecf, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_49fdfd4b, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4a55d03c, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 
2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4abdbdc6, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4ac61d92, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4bf903b0, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_503ce0f3, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_512b1653, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_53c851ab, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_54f0cee2, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_5502c366, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55255f2b, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [], +      []>, + +    InstrItinData <tc_556f6577, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55a9a350, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55b33fda, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_56a124a7, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_57a55b54, /*tc_1*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5944960d, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_59a7822c, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2], 
+      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5a222e89, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5a4b5e58, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5b347363, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5ceb2f9e, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5da50c4b, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5deb5e47, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5e4cf0e8, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_60e324ff, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_63567288, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4], +      [Hex_FWD]>, + +    InstrItinData <tc_64b00d8a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_651cbe02, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_65279839, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_65cbd974, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_69bfb303, /*tc_3*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData 
<tc_6aa823ab, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6ae3426b, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6d861a95, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6e20402a, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6f42bc60, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6fb52018, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6fc5dbea, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_711c805f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_713b66bf, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7401744f, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7476d766, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_74a42bda, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_759e57be, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_76bb5435, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7d6a2568, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_77f94a5e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    
InstrItinData <tc_788b1d09, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_78f87ed3, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_7af3a37e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7b9187d3, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7c28bd7e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_7c31e19a, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7c6d32e4, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7dc63b5c, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7f58404a, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [], +      []>, + +    InstrItinData <tc_7f7f45f5, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7f8ae742, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8035e91f, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_822c3c68, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_829d8a86, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_838c4d7a, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_84a7500d, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + 
     [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_86173609, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_887d1bb7, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8a6d0d94, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8a825db2, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8b5bd4f5, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8e82e8ca, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8f36a2fd, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9124c04f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_92240447, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_934753bb, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_937dd41c, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [], +      []>, + +    InstrItinData <tc_9406230a, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_95a33176, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_95f43c5e, /*tc_3*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_96ef76ef, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3], +      [Hex_FWD, 
Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_975a4e54, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9783714b, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9b20a062, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9b34f5e0, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_9b3c0462, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9bcfb2ee, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9c52f549, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9e27f2f9, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9e72dc89, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9edb7c77, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9edefe01, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9f6cd987, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a08b630b, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a1297125, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a154b476, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2], +      
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a2b365d2, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a3070909, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a32e03e7, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a38c45dc, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a4e22bbd, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a4ee89db, /*tc_2early*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_a724463d, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a7a13fac, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a7bdb22c, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a9edeffa, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_abfd9a6d, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ac65613f, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_addc37a8, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ae5babd7, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_aee6250c, /*tc_ld*/ +      
[InstrStage<1, [SLOT0, SLOT1]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_af6af259, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b1ae5f67, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_b2196a3f, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b3d46584, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_b4dc7630, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b7c4062a, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b837298f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], +      []>, + +    InstrItinData <tc_b9bec29e, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_ba9255a6, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb07f2c5, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb78483e, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb831a7c, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bf2ffc0f, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c20701f0, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c21d7447, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, 
Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c57d9f39, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c818ff7f, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_ce59038e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_cfa0e29b, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d03278fd, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d234b61a, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_d33e5eee, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d3632d88, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d45ba9cd, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_d57d649c, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_d61dfdc3, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d68dca5c, /*tc_3stall*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d71ea8fa, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d7718fbe, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_db596beb, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_db96aa6b, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_dc51281d, 
/*tc_3*/ +      [InstrStage<1, [SLOT2]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_decdde8a, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_df5d53f9, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e3d699e3, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e60def48, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_e9170fb7, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ed03645c, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ed3f8d2a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_eed07714, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_eeda4109, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ef921005, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f098b237, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f0cdeccf, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f0e8e832, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f34c1c21, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f38f92e1, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_f529831b, 
/*tc_latepredstaia*/ +      [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f6e2aff9, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f7569068, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f97707c1, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_f999c66e, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_fae9dfa5, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_fedb7e19, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> +  ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index ae96753..f8f1c2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -39178,6 +39178,19 @@ let opNewValue = 0;  let isCVI = 1;  let DecoderNamespace = "EXT_mmvec";  } +def V6_vsub_hf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +}  def V6_vsub_qf16 : HInst<  (outs HvxVR:$Vd32),  (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -39269,6 +39282,19 @@ let opNewValue = 0;  let isCVI = 1;  let DecoderNamespace = "EXT_mmvec";  } +def V6_vsub_sf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +}  def V6_vsub_sf_sf : HInst<  (outs HvxVR:$Vd32),  (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -41116,6 +41142,17 @@ let hasNewValue = 1;  let opNewValue = 0;  let isSolo = 1;  } +def Y2_tlbpp : HInst< +(outs IntRegs:$Rd32), +(ins DoubleRegs:$Rss32), +"$Rd32 = tlbp($Rss32)", +tc_6aa823ab, TypeCR>, Enc_90cd8b, Requires<[HasV81]> { +let Inst{13-5} = 0b000000000; +let Inst{31-21} = 0b01101100011; +let hasNewValue = 1; +let opNewValue = 0; +let isSolo = 1; +}  def Y2_tlbr : HInst<  (outs DoubleRegs:$Rdd32),  (ins IntRegs:$Rs32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 17cb96c..23f4b3a 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3827,3 +3827,14 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2),           (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX64B]>;  def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),           (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX128B]>; + +// V81 HVX Instructions. + +def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix_128B HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e285e04..7ee280d 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {        IntNo == Intrinsic::hexagon_V6_vgathermh ||        IntNo == Intrinsic::hexagon_V6_vgathermh_128B ||        IntNo == Intrinsic::hexagon_V6_vgathermhw || -      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) { +      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B || +      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh || +      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) {      SelectV65Gather(N);      return;    } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp 
b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index c7a4f68..3cc146b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {    case Intrinsic::hexagon_V6_vgathermhw_128B:      Opcode = Hexagon::V6_vgathermhw_pseudo;      break; +  case Intrinsic::hexagon_V6_vgather_vscattermh: +  case Intrinsic::hexagon_V6_vgather_vscattermh_128B: +    Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo; +    break;    }    SDVTList VTs = CurDAG->getVTList(MVT::Other); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 9f7f434..526b4de 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,    case Intrinsic::hexagon_V6_vgathermhq:    case Intrinsic::hexagon_V6_vgathermhq_128B:    case Intrinsic::hexagon_V6_vgathermhwq: -  case Intrinsic::hexagon_V6_vgathermhwq_128B: { +  case Intrinsic::hexagon_V6_vgathermhwq_128B: +  case Intrinsic::hexagon_V6_vgather_vscattermh: +  case Intrinsic::hexagon_V6_vgather_vscattermh_128B: {      const Module &M = *I.getParent()->getParent()->getParent();      Info.opc = ISD::INTRINSIC_W_CHAIN;      Type *VecTy = I.getArgOperand(1)->getType(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 939841a..47726d6 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {    MachineBasicBlock::iterator First;    switch (Opc) { -    case Hexagon::V6_vgathermh_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) -                  .add(MI.getOperand(2)) -                  
.add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermw_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhw_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -                  .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermwq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -          
        .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhwq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -                  .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); +  case Hexagon::V6_vgather_vscatter_mh_pseudo: +    // This is mainly a place holder. It will be extended. +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh)) +        .add(MI.getOperand(2)) +        .add(MI.getOperand(3)) +        .add(MI.getOperand(4)) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); +  case Hexagon::V6_vgathermh_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermw_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhw_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermwq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhwq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        
.add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator();    }    return MI.getIterator(); @@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,    case Hexagon::V6_vL32b_nt_tmp_npred_ai:    case Hexagon::V6_vS32Ub_npred_ai:    case Hexagon::V6_vgathermh_pseudo: +  case Hexagon::V6_vgather_vscatter_mh_pseudo:    case Hexagon::V6_vgathermw_pseudo:    case Hexagon::V6_vgathermhw_pseudo:    case Hexagon::V6_vgathermhq_pseudo: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td index f927f9b..42393d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td @@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo  : vgathermh<HvxVR>;  defm V6_vgathermw_pseudo  : vgathermw<HvxVR>;  defm V6_vgathermhw_pseudo  : vgathermhw<HvxWR>; + +multiclass vgather_scatter_mh<RegisterClass RC> { +  let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, +  mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in +  def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), +                           (ins IntRegs:$_dst_, s4_0Imm:$Ii, +                                IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), +                           ".error \"should not emit\" ", +                           []>; +} + +defm V6_vgather_vscatter_mh_pseudo  : vgather_scatter_mh<HvxVR>; +  multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {    let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,    mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td index b8a9cf3..9bcd4bf 100644 --- a/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -75,3 +75,4 @@ include "HexagonScheduleV71T.td"  include "HexagonScheduleV73.td" 
 include "HexagonScheduleV75.td"  include "HexagonScheduleV79.td" +include "HexagonScheduleV81.td"
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV81.td b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td new file mode 100644 index 0000000..dd5f5a0 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td @@ -0,0 +1,31 @@ +//=-HexagonScheduleV81.td - HexagonV81 Scheduling Definitions *- tablegen -*-=// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def HexagonV81ItinList : DepScalarItinV81, ScalarItin, +                         DepHVXItinV81, HVXItin, PseudoItin { +  list<InstrItinData> ItinList = +    !listconcat(DepScalarItinV81_list, ScalarItin_list, +                DepHVXItinV81_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV81 : +      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, +                            CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, +                            CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, +                            CVI_ALL_NOMEM, CVI_ZW], +                            [Hex_FWD, HVX_FWD], +                            HexagonV81ItinList.ItinList>; + +def HexagonModelV81 : SchedMachineModel { +  // Max issue per cycle == bundle width. 
+  let IssueWidth = 4; +  let Itineraries = HexagonItinerariesV81; +  let LoadLatency = 1; +  let CompleteModel = 0; +} diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 7430567..995f66d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -224,6 +224,15 @@ public:    bool useHVXV79Ops() const {      return HexagonHVXVersion >= Hexagon::ArchEnum::V79;    } +  bool hasV81Ops() const { +    return getHexagonArchVersion() >= Hexagon::ArchEnum::V81; +  } +  bool hasV81OpsOnly() const { +    return getHexagonArchVersion() == Hexagon::ArchEnum::V81; +  } +  bool useHVXV81Ops() const { +    return HexagonHVXVersion >= Hexagon::ArchEnum::V81; +  }    bool useAudioOps() const { return UseAudioOps; }    bool useCompound() const { return UseCompound; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 171e294..e925e04 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -31,6 +31,10 @@ using namespace llvm;  static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),      cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); +cl::opt<bool> HexagonAllowScatterGatherHVX( +    "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden, +    cl::desc("Allow auto-generation of HVX scatter-gather")); +  static cl::opt<bool> EnableV68FloatAutoHVX(      "force-hvx-float", cl::Hidden,      cl::desc("Enable auto-vectorization of floatint point types on v68.")); @@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,    return HexagonMaskedVMem && ST.isTypeForHVX(DataType);  } +bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const { +  // For now assume we can not deal with all HVX datatypes. 
+  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || +      !HexagonAllowScatterGatherHVX) +    return false; +  // This must be in sync with HexagonVectorCombine pass. +  switch (Ty->getScalarSizeInBits()) { +  case 8: +    return (getTypeNumElements(Ty) == 128); +  case 16: +    if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32) +      return (Alignment >= 2); +    break; +  case 32: +    if (getTypeNumElements(Ty) == 32) +      return (Alignment >= 4); +    break; +  default: +    break; +  } +  return false; +} + +bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const { +  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || +      !HexagonAllowScatterGatherHVX) +    return false; +  // This must be in sync with HexagonVectorCombine pass. +  switch (Ty->getScalarSizeInBits()) { +  case 8: +    return (getTypeNumElements(Ty) == 128); +  case 16: +    if (getTypeNumElements(Ty) == 64) +      return (Alignment >= 2); +    break; +  case 32: +    if (getTypeNumElements(Ty) == 32) +      return (Alignment >= 4); +    break; +  default: +    break; +  } +  return false; +} + +bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, +                                                Align Alignment) const { +  return !isLegalMaskedGather(VTy, Alignment); +} + +bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy, +                                                 Align Alignment) const { +  return !isLegalMaskedScatter(VTy, Alignment); +} +  /// --- Vector TTI end ---  unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index dbf16c9..cec2bf9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -169,6 +169,12 @@ public:                            unsigned AddressSpace) const override;    bool isLegalMaskedLoad(Type *DataType, 
Align Alignment,                           unsigned AddressSpace) const override; +  bool isLegalMaskedGather(Type *Ty, Align Alignment) const override; +  bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override; +  bool forceScalarizeMaskedGather(VectorType *VTy, +                                  Align Alignment) const override; +  bool forceScalarizeMaskedScatter(VectorType *VTy, +                                   Align Alignment) const override;    /// @} diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 9ab5202..5c50ec2 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -57,6 +57,11 @@  #define DEBUG_TYPE "hexagon-vc" +// This is a const that represents default HVX VTCM page size. +// It is boot time configurable, so we probably want an API to +// read it, but for now assume 128KB +#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072 +  using namespace llvm;  namespace { @@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {  class HvxIdioms {  public: +  enum DstQualifier { +    Undefined = 0, +    Arithmetic, +    LdSt, +    LLVM_Gather, +    LLVM_Scatter, +    HEX_Gather_Scatter, +    HEX_Gather, +    HEX_Scatter, +    Call +  }; +    HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {      auto *Int32Ty = HVC.getIntTy(32);      HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false); @@ -473,6 +490,11 @@ private:    auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,                       Signedness SgnX, ArrayRef<Value *> WordY,                       Signedness SgnY) const -> SmallVector<Value *>; +  // Vector manipulations for Ripple +  bool matchScatter(Instruction &In) const; +  bool matchGather(Instruction &In) const; +  Value *processVScatter(Instruction &In) const; +  Value *processVGather(Instruction &In) const;    VectorType *HvxI32Ty;    VectorType 
*HvxP32Ty; @@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {  }  auto AlignVectors::run() -> bool { -  LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName() +  LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()                      << '\n');    if (!createAddressGroups())      return false; @@ -1797,6 +1819,846 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const    return Ext;  } +inline bool HvxIdioms::matchScatter(Instruction &In) const { +  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); +  if (!II) +    return false; +  return (II->getIntrinsicID() == Intrinsic::masked_scatter); +} + +inline bool HvxIdioms::matchGather(Instruction &In) const { +  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); +  if (!II) +    return false; +  return (II->getIntrinsicID() == Intrinsic::masked_gather); +} + +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual); + +// Binary instructions we want to handle as users of gather/scatter. +inline bool isArithmetic(unsigned Opc) { +  switch (Opc) { +  case Instruction::Add: +  case Instruction::Sub: +  case Instruction::Mul: +  case Instruction::And: +  case Instruction::Or: +  case Instruction::Xor: +  case Instruction::AShr: +  case Instruction::LShr: +  case Instruction::Shl: +  case Instruction::UDiv: +    return true; +  } +  return false; +} + +// TODO: Maybe use MemoryLocation for this. See getLocOrNone above. 
+inline Value *getPointer(Value *Ptr) { +  assert(Ptr && "Unable to extract pointer"); +  if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) +    return Ptr; +  if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr)) +    return getLoadStorePointerOperand(Ptr); +  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { +    if (II->getIntrinsicID() == Intrinsic::masked_store) +      return II->getOperand(1); +  } +  return nullptr; +} + +static Instruction *selectDestination(Instruction *In, +                                      HvxIdioms::DstQualifier &Qual) { +  Instruction *Destination = nullptr; +  if (!In) +    return Destination; +  if (isa<StoreInst>(In)) { +    Destination = In; +    Qual = HvxIdioms::LdSt; +  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { +    if (II->getIntrinsicID() == Intrinsic::masked_gather) { +      Destination = In; +      Qual = HvxIdioms::LLVM_Gather; +    } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) { +      Destination = In; +      Qual = HvxIdioms::LLVM_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::masked_store) { +      Destination = In; +      Qual = HvxIdioms::LdSt; +    } else if (II->getIntrinsicID() == +               Intrinsic::hexagon_V6_vgather_vscattermh) { +      Destination = In; +      Qual = HvxIdioms::HEX_Gather_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) { +      Destination = In; +      Qual = HvxIdioms::HEX_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) { +      Destination = In; +      Qual = HvxIdioms::HEX_Gather; +    } +  } else if (isa<ZExtInst>(In)) { +    return locateDestination(In, Qual); +  } else if (isa<CastInst>(In)) { +    return locateDestination(In, Qual); +  } else if (isa<CallInst>(In)) { +    Destination = In; +    Qual = HvxIdioms::Call; +  } else if (isa<GetElementPtrInst>(In)) { +    return locateDestination(In, Qual); +  } else if 
(isArithmetic(In->getOpcode())) { +    Destination = In; +    Qual = HvxIdioms::Arithmetic; +  } else { +    LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n"); +  } +  return Destination; +} + +// This method attempts to find destination (user) for a given intrinsic. +// Given that these are produced only by Ripple, the number of options is +// limited. Simplest case is explicit store which in fact is redundant (since +// HVX gater creates its own store during packetization). Nevertheless we need +// to figure address where we storing. Other cases are more complicated, but +// still few. +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) { +  Instruction *Destination = nullptr; +  if (!In) +    return Destination; +  // Get all possible destinations +  SmallVector<Instruction *> Users; +  // Iterate over the uses of the instruction +  for (auto &U : In->uses()) { +    if (auto *UI = dyn_cast<Instruction>(U.getUser())) { +      Destination = selectDestination(UI, Qual); +      if (Destination) +        Users.push_back(Destination); +    } +  } +  // Now see which of the users (if any) is a memory destination. +  for (auto *I : Users) +    if (getPointer(I)) +      return I; +  return Destination; +} + +// The two intrinsics we handle here have GEP in a different position. 
+/// Returns the GEP that produces the pointer operand of \p In, where \p In is
+/// an llvm.masked.gather (pointers are operand 0) or an llvm.masked.scatter
+/// (pointers are operand 1).
+///
+/// Returns nullptr when \p In is not one of these two intrinsics or when the
+/// pointer operand is not a GEP. Unsupported instructions are rejected here
+/// rather than only asserted on, so builds without assertions never
+/// dereference a null IntrinsicInst or read an operand of an unrelated
+/// intrinsic. \p In must not be null.
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
+  assert(In && "Bad instruction");
+  auto *IIn = dyn_cast<IntrinsicInst>(In);
+  if (!IIn)
+    return nullptr;
+  switch (IIn->getIntrinsicID()) {
+  case Intrinsic::masked_gather:
+    return dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
+  case Intrinsic::masked_scatter:
+    return dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
+  default:
+    return nullptr;
+  }
+}
+
+// Given the intrinsic find its GEP argument and extract the base address it
+// uses. The method relies on the way Ripple typically forms the GEP for
+// scatter/gather.
+//
+// Recognized shapes of the GEP pointer operand:
+//   - a load: returned as is;
+//   - zext of a load: the load;
+//   - zext of an llvm.masked.gather: recurse into that gather;
+//   - shufflevector whose operand 0 is a load: the load;
+//   - shufflevector whose operand 0 is an insertelement: the inserted scalar
+//     (operand 1) if it is a load, alloca, argument or global.
+// Returns nullptr for anything else.
+static Value *locateAddressFromIntrinsic(Instruction *In) {
+  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+  if (!GEPIndex) {
+    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n");
+    return nullptr;
+  }
+
+  Value *BaseAddress = GEPIndex->getPointerOperand();
+  if (isa<LoadInst>(BaseAddress))
+    return BaseAddress;
+
+  if (auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress)) {
+    Value *Extended = IndexZEx->getOperand(0);
+    if (isa<LoadInst>(Extended))
+      return Extended;
+    auto *II = dyn_cast<IntrinsicInst>(Extended);
+    if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+      return locateAddressFromIntrinsic(II);
+  }
+
+  if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress)) {
+    Value *Shuffled = BaseShuffle->getOperand(0);
+    if (isa<LoadInst>(Shuffled))
+      return Shuffled;
+    if (auto *IE = dyn_cast<InsertElementInst>(Shuffled)) {
+      Value *Src = IE->getOperand(1);
+      if (isa<LoadInst>(Src) || isa<AllocaInst>(Src) || isa<Argument>(Src) ||
+          isa<GlobalValue>(Src))
+        return Src;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "  Unable to locate Address from intrinsic\n");
+  return nullptr;
+}
+
+/// Returns the type of the data accessed or produced by \p In:
+///   - load/store: the loaded or stored type;
+///   - llvm.masked.load: the result type;
+///   - llvm.masked.store: the type of the stored value (operand 0);
+///   - anything else: the type of \p In itself.
+/// Returns nullptr when \p In is null.
+static Type *getIndexType(Value *In) {
+  if (!In)
+    return nullptr;
+
+  if (isa<LoadInst>(In) || isa<StoreInst>(In))
+    return getLoadStoreType(In);
+
+  if (auto *II = dyn_cast<IntrinsicInst>(In)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::masked_load:
+      return II->getType();
+    case Intrinsic::masked_store:
+      return II->getOperand(0)->getType();
+    default:
+      break;
+    }
+  }
+  return In->getType();
+}
+
+/// Walks from a GEP index operand \p In back to the value that supplies the
+/// indexes.
+///
+/// Loads, llvm.masked.load and llvm.masked.gather calls, and constant data
+/// vectors are returned as is. zext/sext and shufflevector are looked through
+/// via operand 0, insertelement via the inserted scalar (operand 1). For a
+/// GEP its operand 0 is returned without further recursion. Returns nullptr
+/// for a null or unrecognized value.
+static Value *locateIndexesFromGEP(Value *In) {
+  if (!In)
+    return nullptr;
+  if (isa<LoadInst>(In) || isa<ConstantDataVector>(In))
+    return In;
+  if (auto *II = dyn_cast<IntrinsicInst>(In)) {
+    Intrinsic::ID ID = II->getIntrinsicID();
+    if (ID == Intrinsic::masked_load || ID == Intrinsic::masked_gather)
+      return In;
+  }
+  if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
+    return locateIndexesFromGEP(IndexZEx->getOperand(0));
+  if (auto *IndexSEx = dyn_cast<SExtInst>(In))
+    return locateIndexesFromGEP(IndexSEx->getOperand(0));
+  if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
+    return locateIndexesFromGEP(BaseShuffle->getOperand(0));
+  if (auto *IE = dyn_cast<InsertElementInst>(In))
+    return locateIndexesFromGEP(IE->getOperand(1));
+  if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
+    return GEPIndex->getOperand(0);
+  return nullptr;
+}
+
+// Given the intrinsic find its GEP argument and extract offsets from the base
+// address it uses.
+static Value *locateIndexesFromIntrinsic(Instruction *In) { +  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); +  if (!GEPIndex) { +    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n"); +    return nullptr; +  } +  Value *Indexes = GEPIndex->getOperand(1); +  if (auto *IndexLoad = locateIndexesFromGEP(Indexes)) +    return IndexLoad; + +  LLVM_DEBUG(dbgs() << "  Unable to locate Index from intrinsic\n"); +  return nullptr; +} + +// Because of aukward definition of many Hex intrinsics we often have to +// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP +// for all use cases, so this only exist to make IR builder happy. +inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, +                                               IRBuilderBase &Builder, +                                               LLVMContext &Ctx, Value *I) { +  assert(I && "Unable to reinterprete cast"); +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  std::vector<unsigned> shuffleMask; +  for (unsigned i = 0; i < 64; ++i) +    shuffleMask.push_back(i); +  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); +  Value *CastShuffle = +      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); +  return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32"); +} + +// Recast <128 x i8> as <32 x i32> +inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, +                                              IRBuilderBase &Builder, +                                              LLVMContext &Ctx, Value *I) { +  assert(I && "Unable to reinterprete cast"); +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  std::vector<unsigned> shuffleMask; +  for (unsigned i = 0; i < 128; ++i) +    shuffleMask.push_back(i); +  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); +  Value *CastShuffle = +      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); +  return 
Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32"); +} + +// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern +inline Value *get_i32_Mask(const HexagonVectorCombine &HVC, +                           IRBuilderBase &Builder, LLVMContext &Ctx, +                           unsigned int pattern) { +  std::vector<unsigned int> byteMask; +  for (unsigned i = 0; i < 32; ++i) +    byteMask.push_back(pattern); + +  return Builder.CreateIntrinsic( +      HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt), +      {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)}, +      nullptr); +} + +Value *HvxIdioms::processVScatter(Instruction &In) const { +  auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType()); +  assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather"); +  unsigned InpSize = HVC.getSizeOf(InpTy); +  auto *F = In.getFunction(); +  LLVMContext &Ctx = F->getContext(); +  auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType()); +  assert(ElemTy && "llvm.scatter needs integer type argument"); +  unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy); +  LLVM_DEBUG({ +    unsigned Elements = HVC.length(InpTy); +    dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n"; +    dbgs() << "  Input type(" << *InpTy << ") elements(" << Elements +           << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth(" +           << ElemWidth << ")\n"; +  }); + +  IRBuilder Builder(In.getParent(), In.getIterator(), +                    InstSimplifyFolder(HVC.DL)); + +  auto *ValueToScatter = In.getOperand(0); +  LLVM_DEBUG(dbgs() << "  ValueToScatter   : " << *ValueToScatter << "\n"); + +  if (HVC.HST.getVectorLength() != InpSize) { +    LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize +                      << ") for vscatter\n"); +    return nullptr; +  } + +  // Base address of indexes. 
+  auto *IndexLoad = locateAddressFromIntrinsic(&In); +  if (!IndexLoad) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  IndexLoad        : " << *IndexLoad << "\n"); + +  // Address of destination. Must be in VTCM. +  auto *Ptr = getPointer(IndexLoad); +  if (!Ptr) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Ptr              : " << *Ptr << "\n"); +  // Indexes/offsets +  auto *Indexes = locateIndexesFromIntrinsic(&In); +  if (!Indexes) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Indexes          : " << *Indexes << "\n"); +  Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx), +                                                    "cst_ptr_to_i32"); +  LLVM_DEBUG(dbgs() << "  CastedDst        : " << *CastedDst << "\n"); +  // Adjust Indexes +  auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +  Value *CastIndex = nullptr; +  if (cstDataVector) { +    // Our indexes are represented as a constant. We need it in a reg. +    AllocaInst *IndexesAlloca = +        Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false)); +    [[maybe_unused]] auto *StoreIndexes = +        Builder.CreateStore(cstDataVector, IndexesAlloca); +    LLVM_DEBUG(dbgs() << "  StoreIndexes     : " << *StoreIndexes << "\n"); +    CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(), +                                   IndexesAlloca, "reload_index"); +  } else { +    if (ElemWidth == 2) +      CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +    else +      CastIndex = Indexes; +  } +  LLVM_DEBUG(dbgs() << "  Cast index       : " << *CastIndex << ")\n"); + +  if (ElemWidth == 1) { +    // v128i8 There is no native instruction for this. +    // Do this as two Hi/Lo gathers with masking. +    Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +    // Extend indexes. 
We assume that indexes are in 128i8 format - need to +    // expand them to Hi/Lo 64i16 +    Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32"); +    auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); +    auto *UnpackedIndexes = Builder.CreateIntrinsic( +        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr); +    LLVM_DEBUG(dbgs() << "  UnpackedIndexes  : " << *UnpackedIndexes << ")\n"); + +    auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); +    auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); +    [[maybe_unused]] Value *IndexHi = +        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); +    [[maybe_unused]] Value *IndexLo = +        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); +    LLVM_DEBUG(dbgs() << "  UnpackedIndHi    : " << *IndexHi << ")\n"); +    LLVM_DEBUG(dbgs() << "  UnpackedIndLo    : " << *IndexLo << ")\n"); +    // Now unpack values to scatter +    Value *CastSrc = +        getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter); +    LLVM_DEBUG(dbgs() << "  CastSrc          : " << *CastSrc << ")\n"); +    auto *UnpackedValueToScatter = Builder.CreateIntrinsic( +        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr); +    LLVM_DEBUG(dbgs() << "  UnpackedValToScat: " << *UnpackedValueToScatter +                      << ")\n"); + +    [[maybe_unused]] Value *UVSHi = +        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter); +    [[maybe_unused]] Value *UVSLo = +        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter); +    LLVM_DEBUG(dbgs() << "  UVSHi            : " << *UVSHi << ")\n"); +    LLVM_DEBUG(dbgs() << "  UVSLo            : " << *UVSLo << ")\n"); + +    // Create the mask for individual bytes +    auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); +    LLVM_DEBUG(dbgs() << "  QByteMask        : " << *QByteMask << "\n"); +    [[maybe_unused]] auto *ResHi = 
Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, +        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         IndexHi, UVSHi}, +        nullptr); +    LLVM_DEBUG(dbgs() << "  ResHi            : " << *ResHi << ")\n"); +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, +        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         IndexLo, UVSLo}, +        nullptr); +  } else if (ElemWidth == 2) { +    Value *CastSrc = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter); +    LLVM_DEBUG(dbgs() << "  CastSrc        : " << *CastSrc << ")\n"); +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B, +        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, +         CastSrc}, +        nullptr); +  } else if (ElemWidth == 4) { +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B, +        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, +         ValueToScatter}, +        nullptr); +  } else { +    LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n"); +    return nullptr; +  } +} + +Value *HvxIdioms::processVGather(Instruction &In) const { +  [[maybe_unused]] auto *InpTy = +      dyn_cast<VectorType>(In.getOperand(0)->getType()); +  assert(InpTy && "Cannot handle no vector type for llvm.gather"); +  [[maybe_unused]] auto *ElemTy = +      dyn_cast<PointerType>(InpTy->getElementType()); +  assert(ElemTy && "llvm.gather needs vector of ptr argument"); +  auto *F = In.getFunction(); +  LLVMContext &Ctx = F->getContext(); +  LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n" +                    << *In.getParent() << "\n"); +  LLVM_DEBUG(dbgs() << "  Input type(" << *InpTy << ") elements(" +                    << HVC.length(InpTy) << ") VecLen(" 
<< HVC.getSizeOf(InpTy) +                    << ") type(" << *ElemTy << ") Access alignment(" +                    << *In.getOperand(1) << ") AddressSpace(" +                    << ElemTy->getAddressSpace() << ")\n"); + +  // TODO: Handle masking of elements. +  assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) && +         "llvm.gather needs vector for mask"); +  IRBuilder Builder(In.getParent(), In.getIterator(), +                    InstSimplifyFolder(HVC.DL)); + +  // See who is using the result. The difference between LLVM and HVX vgather +  // Intrinsic makes it impossible to handle all cases with temp storage. Alloca +  // in VTCM is not yet supported, so for now we just bail out for those cases. +  HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined; +  Instruction *Dst = locateDestination(&In, Qual); +  if (!Dst) { +    LLVM_DEBUG(dbgs() << "  Unable to locate vgather destination\n"); +    return nullptr; +  } +  LLVM_DEBUG(dbgs() << "  Destination    : " << *Dst << " Qual(" << Qual +                    << ")\n"); + +  // Address of destination. Must be in VTCM. +  auto *Ptr = getPointer(Dst); +  if (!Ptr) { +    LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n"); +    return nullptr; +  } + +  // Result type. Assume it is a vector type. 
+  auto *DstType = cast<VectorType>(getIndexType(Dst)); +  assert(DstType && "Cannot handle non vector dst type for llvm.gather"); + +  // Base address for sources to be loaded +  auto *IndexLoad = locateAddressFromIntrinsic(&In); +  if (!IndexLoad) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  IndexLoad      : " << *IndexLoad << "\n"); + +  // Gather indexes/offsets +  auto *Indexes = locateIndexesFromIntrinsic(&In); +  if (!Indexes) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Indexes        : " << *Indexes << "\n"); + +  Instruction *Gather = nullptr; +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) { +    // We fully assume the address space is in VTCM. We also assume that all +    // pointers in Operand(0) have the same base(!). +    // This is the most basic case of all the above. +    unsigned OutputSize = HVC.getSizeOf(DstType); +    auto *DstElemTy = cast<IntegerType>(DstType->getElementType()); +    unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy); +    LLVM_DEBUG(dbgs() << "  Buffer type    : " << *Ptr->getType() +                      << "  Address space (" +                      << Ptr->getType()->getPointerAddressSpace() << ")\n" +                      << "  Result type    : " << *DstType +                      << "\n  Size in bytes  : " << OutputSize +                      << " element type(" << *DstElemTy +                      << ")\n  ElemWidth      : " << ElemWidth << " bytes\n"); + +    auto *IndexType = cast<VectorType>(getIndexType(Indexes)); +    assert(IndexType && "Cannot handle non vector index type for llvm.gather"); +    unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType()); +    LLVM_DEBUG(dbgs() << "  IndexWidth(" << IndexWidth << ")\n"); + +    // Intrinsic takes i32 instead of pointer so cast. 
+    Value *CastedPtr = Builder.CreateBitOrPointerCast( +        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...] +    // int_hexagon_V6_vgathermh       [... , llvm_v16i32_ty] +    // int_hexagon_V6_vgathermh_128B  [... , llvm_v32i32_ty] +    // int_hexagon_V6_vgathermhw      [... , llvm_v32i32_ty] +    // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty] +    // int_hexagon_V6_vgathermw       [... , llvm_v16i32_ty] +    // int_hexagon_V6_vgathermw_128B  [... , llvm_v32i32_ty] +    if (HVC.HST.getVectorLength() == OutputSize) { +      if (ElemWidth == 1) { +        // v128i8 There is no native instruction for this. +        // Do this as two Hi/Lo gathers with masking. +        // Unpack indexes. We assume that indexes are in 128i8 format - need to +        // expand them to Hi/Lo 64i16 +        Value *CastIndexes = +            Builder.CreateBitCast(Indexes, NT, "cast_to_32i32"); +        auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); +        auto *UnpackedIndexes = +            Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true), +                                    V6_vunpack, CastIndexes, nullptr); +        LLVM_DEBUG(dbgs() << "  UnpackedIndexes : " << *UnpackedIndexes +                          << ")\n"); + +        auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); +        auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); +        [[maybe_unused]] Value *IndexHi = +            HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); +        [[maybe_unused]] Value *IndexLo = +            HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); +        LLVM_DEBUG(dbgs() << "  UnpackedIndHi   : " << *IndexHi << ")\n"); +        LLVM_DEBUG(dbgs() << "  UnpackedIndLo   : " << *IndexLo << ")\n"); +        // Create the mask for individual bytes +        auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); +        LLVM_DEBUG(dbgs() << "  QByteMask     
  : " << *QByteMask << "\n"); +        // We use our destination allocation as a temp storage +        // This is unlikely to work properly for masked gather. +        auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq); +        [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), V6_vgather, +            {Ptr, QByteMask, CastedPtr, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi}, +            nullptr); +        LLVM_DEBUG(dbgs() << "  GatherHi        : " << *GatherHi << ")\n"); +        // Rematerialize the result +        [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi"); +        LLVM_DEBUG(dbgs() << "  LoadedResultHi : " << *LoadedResultHi << "\n"); +        // Same for the low part. Here we use Gather to return non-NULL result +        // from this function and continue to iterate. We also are deleting Dst +        // store below. +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), V6_vgather, +            {Ptr, QByteMask, CastedPtr, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo}, +            nullptr); +        LLVM_DEBUG(dbgs() << "  GatherLo        : " << *Gather << ")\n"); +        Value *LoadedResultLo = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo"); +        LLVM_DEBUG(dbgs() << "  LoadedResultLo : " << *LoadedResultLo << "\n"); +        // Now we have properly sized bytes in every other position +        // B b A a c a A b B c f F g G h H is presented as +        // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . 
H +        // Use vpack to gather them +        auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb); +        [[maybe_unused]] auto Res = Builder.CreateIntrinsic( +            NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr); +        LLVM_DEBUG(dbgs() << "  ScaledRes      : " << *Res << "\n"); +        [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr); +        LLVM_DEBUG(dbgs() << "  StoreRes       : " << *StoreRes << "\n"); +      } else if (ElemWidth == 2) { +        // v32i16 +        if (IndexWidth == 2) { +          // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match. +          Value *CastIndex = +              getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +          LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n"); +          // shift all i16 left by 1 to match short addressing mode instead of +          // byte. +          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +          Value *AdjustedIndex = HVC.createHvxIntrinsic( +              Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); +          LLVM_DEBUG(dbgs() +                     << "  Shifted half index: " << *AdjustedIndex << ")\n"); + +          auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh); +          // The 3rd argument is the size of the region to gather from. Probably +          // want to set it to max VTCM size. 
+          Gather = Builder.CreateIntrinsic( +              Type::getVoidTy(Ctx), V6_vgather, +              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +               AdjustedIndex}, +              nullptr); +          for (auto &U : Dst->uses()) { +            if (auto *UI = dyn_cast<Instruction>(U.getUser())) +              dbgs() << "    dst used by: " << *UI << "\n"; +          } +          for (auto &U : In.uses()) { +            if (auto *UI = dyn_cast<Instruction>(U.getUser())) +              dbgs() << "    In used by : " << *UI << "\n"; +          } +          // Create temp load from result in case the result is used by any +          // other instruction. +          Value *LoadedResult = Builder.CreateLoad( +              HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result"); +          LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +          In.replaceAllUsesWith(LoadedResult); +        } else { +          dbgs() << "    Unhandled index type for vgather\n"; +          return nullptr; +        } +      } else if (ElemWidth == 4) { +        if (IndexWidth == 4) { +          // v32i32 +          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +          Value *AdjustedIndex = HVC.createHvxIntrinsic( +              Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)}); +          LLVM_DEBUG(dbgs() +                     << "  Shifted word index: " << *AdjustedIndex << ")\n"); +          Gather = Builder.CreateIntrinsic( +              Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B, +              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +               AdjustedIndex}, +              nullptr); +        } else { +          LLVM_DEBUG(dbgs() << "    Unhandled index type for vgather\n"); +          return nullptr; +        } +      } else { +        LLVM_DEBUG(dbgs() << "    Unhandled element type for vgather\n"); +        return nullptr; +      } +    } else if 
(HVC.HST.getVectorLength() == OutputSize * 2) { +      // This is half of the reg width, duplicate low in high +      LLVM_DEBUG(dbgs() << "    Unhandled half of register size\n"); +      return nullptr; +    } else if (HVC.HST.getVectorLength() * 2 == OutputSize) { +      LLVM_DEBUG(dbgs() << "    Unhandle twice the register size\n"); +      return nullptr; +    } +    // Erase the original intrinsic and store that consumes it. +    // HVX will create a pseudo for gather that is expanded to gather + store +    // during packetization. +    Dst->eraseFromParent(); +  } else if (Qual == HvxIdioms::LLVM_Scatter) { +    // Gather feeds directly into scatter. +    LLVM_DEBUG({ +      auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType()); +      assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); +      unsigned DstInpSize = HVC.getSizeOf(DstInpTy); +      unsigned DstElements = HVC.length(DstInpTy); +      auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType()); +      assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); +      dbgs() << "  Gather feeds into scatter\n  Values to scatter : " +             << *Dst->getOperand(0) << "\n"; +      dbgs() << "  Dst type(" << *DstInpTy << ") elements(" << DstElements +             << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy +             << ") Access alignment(" << *Dst->getOperand(2) << ")\n"; +    }); +    // Address of source +    auto *Src = getPointer(IndexLoad); +    if (!Src) +      return nullptr; +    LLVM_DEBUG(dbgs() << "  Src            : " << *Src << "\n"); + +    if (!isa<PointerType>(Src->getType())) { +      LLVM_DEBUG(dbgs() << "    Source is not a pointer type...\n"); +      return nullptr; +    } + +    Value *CastedSrc = Builder.CreateBitOrPointerCast( +        Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n"); + +    auto *DstLoad = locateAddressFromIntrinsic(Dst); +    if (!DstLoad) 
{ +      LLVM_DEBUG(dbgs() << "  Unable to locate DstLoad\n"); +      return nullptr; +    } +    LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n"); + +    Value *Ptr = getPointer(DstLoad); +    if (!Ptr) +      return nullptr; +    LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n"); +    Value *CastIndex = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad); +    LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n"); +    // Shift all i16 left by 1 to match short addressing mode instead of +    // byte. +    auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +    Value *AdjustedIndex = HVC.createHvxIntrinsic( +        Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); +    LLVM_DEBUG(dbgs() << "  Shifted half index: " << *AdjustedIndex << ")\n"); + +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +        {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         AdjustedIndex}, +        nullptr); +  } else if (Qual == HvxIdioms::HEX_Gather_Scatter) { +    // Gather feeds into previously inserted pseudo intrinsic. +    // These could not be in the same packet, so we need to generate another +    // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo +    // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt, +    // ModRegs:$Mu, HvxVR:$Vv) +    if (isa<AllocaInst>(IndexLoad)) { +      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +      if (cstDataVector) { +        // Our indexes are represented as a constant. We need THEM in a reg. +        // This most likely will not work properly since alloca gives us DDR +        // stack location. This will be fixed once we teach compiler about VTCM. 
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); +        [[maybe_unused]] auto *StoreIndexes = +            Builder.CreateStore(cstDataVector, IndexesAlloca); +        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n"); +        Value *LoadedIndex = Builder.CreateLoad( +            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); +        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca << "\n"); + +        Value *CastedSrc = Builder.CreateBitOrPointerCast( +            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); + +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +            {ResultAlloca, CastedSrc, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, +            nullptr); +        Value *LoadedResult = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n"); +        In.replaceAllUsesWith(LoadedResult); +      } +    } else { +      // Address of source +      auto *Src = getPointer(IndexLoad); +      if (!Src) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  Src      : " << *Src << "\n"); + +      Value *CastedSrc = Builder.CreateBitOrPointerCast( +          Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +      LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n"); + +      auto *DstLoad = locateAddressFromIntrinsic(Dst); +      if (!DstLoad) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n"); +      auto *Ptr = getPointer(DstLoad); +      if (!Ptr) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr 
<< "\n"); + +      Gather = Builder.CreateIntrinsic( +          Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh, +          {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +           Indexes}, +          nullptr); +    } +    return Gather; +  } else if (Qual == HvxIdioms::HEX_Scatter) { +    // This is the case when result of a gather is used as an argument to +    // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it +    // ourselves. We have to create alloca, store to it, and replace all uses +    // with that. +    AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +    Value *CastedSrc = Builder.CreateBitOrPointerCast( +        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); +    Value *CastIndex = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +    LLVM_DEBUG(dbgs() << "  Cast index     : " << *CastIndex << ")\n"); + +    Gather = Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +        {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         CastIndex}, +        nullptr); +    Value *LoadedResult = Builder.CreateLoad( +        HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +    LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +    In.replaceAllUsesWith(LoadedResult); +  } else if (Qual == HvxIdioms::HEX_Gather) { +    // Gather feeds to another gather but already replaced with +    // hexagon_V6_vgathermh_128B +    if (isa<AllocaInst>(IndexLoad)) { +      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +      if (cstDataVector) { +        // Our indexes are represented as a constant. We need it in a reg. 
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + +        [[maybe_unused]] auto *StoreIndexes = +            Builder.CreateStore(cstDataVector, IndexesAlloca); +        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n"); +        Value *LoadedIndex = Builder.CreateLoad( +            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); +        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca +                          << "\n  AddressSpace: " +                          << ResultAlloca->getAddressSpace() << "\n";); + +        Value *CastedSrc = Builder.CreateBitOrPointerCast( +            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); + +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +            {ResultAlloca, CastedSrc, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, +            nullptr); +        Value *LoadedResult = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n"); +        In.replaceAllUsesWith(LoadedResult); +      } +    } +  } else if (Qual == HvxIdioms::LLVM_Gather) { +    // Gather feeds into another gather +    errs() << " Underimplemented vgather to vgather sequence\n"; +    return nullptr; +  } else +    llvm_unreachable("Unhandled Qual enum"); + +  return Gather; +} +  auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,                                       const FxpOp &Op) const -> Value * {    assert(Op.X.Val->getType() == Op.Y.Val->getType()); @@ -2138,6 +3000,26 @@ auto HvxIdioms::run() -> bool {          It = StartOver ? 
B.rbegin()                         : cast<Instruction>(New)->getReverseIterator();          Changed = true; +      } else if (matchGather(*It)) { +        Value *New = processVGather(*It); +        if (!New) +          continue; +        LLVM_DEBUG(dbgs() << "  Gather : " << *New << "\n"); +        // We replace original intrinsic with a new pseudo call. +        It->eraseFromParent(); +        It = cast<Instruction>(New)->getReverseIterator(); +        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); +        Changed = true; +      } else if (matchScatter(*It)) { +        Value *New = processVScatter(*It); +        if (!New) +          continue; +        LLVM_DEBUG(dbgs() << "  Scatter : " << *New << "\n"); +        // We replace original intrinsic with a new pseudo call. +        It->eraseFromParent(); +        It = cast<Instruction>(New)->getReverseIterator(); +        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); +        Changed = true;        }      }    } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 6455757..2f59b7c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -186,6 +186,9 @@ static unsigned featureToArchVersion(unsigned Feature) {    case Hexagon::ArchV79:    case Hexagon::ExtensionHVXV79:      return 79; +  case Hexagon::ArchV81: +  case Hexagon::ExtensionHVXV81: +    return 81;    }    llvm_unreachable("Expected valid arch feature");    return 0; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 6b48a21..b8075bd 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -96,6 +96,8 @@ cl::opt<bool> MV75("mv75", cl::Hidden, cl::desc("Build for Hexagon V75"),  
                   cl::init(false));  cl::opt<bool> MV79("mv79", cl::Hidden, cl::desc("Build for Hexagon V79"),                     cl::init(false)); +cl::opt<bool> MV81("mv81", cl::Hidden, cl::desc("Build for Hexagon V81"), +                   cl::init(false));  } // namespace  static cl::opt<Hexagon::ArchEnum> EnableHVX( @@ -111,6 +113,7 @@ static cl::opt<Hexagon::ArchEnum> EnableHVX(                 clEnumValN(Hexagon::ArchEnum::V73, "v73", "Build for HVX v73"),                 clEnumValN(Hexagon::ArchEnum::V75, "v75", "Build for HVX v75"),                 clEnumValN(Hexagon::ArchEnum::V79, "v79", "Build for HVX v79"), +               clEnumValN(Hexagon::ArchEnum::V81, "v81", "Build for HVX v81"),                 // Sentinel for no value specified.                 clEnumValN(Hexagon::ArchEnum::Generic, "", "")),      // Sentinel for flag not present. @@ -159,6 +162,8 @@ static StringRef HexagonGetArchVariant() {      return "hexagonv75";    if (MV79)      return "hexagonv79"; +  if (MV81) +    return "hexagonv81";    return "";  } @@ -474,6 +479,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {    case Hexagon::ArchEnum::V79:      Result.push_back("+hvxv79");      break; +  case Hexagon::ArchEnum::V81: +    Result.push_back("+hvxv81"); +    break;    case Hexagon::ArchEnum::Generic: {      Result.push_back(StringSwitch<StringRef>(CPU) @@ -489,7 +497,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {                           .Case("hexagonv71t", "+hvxv71")                           .Case("hexagonv73", "+hvxv73")                           .Case("hexagonv75", "+hvxv75") -                         .Case("hexagonv79", "+hvxv79")); +                         .Case("hexagonv79", "+hvxv79") +                         .Case("hexagonv81", "+hvxv81"));      break;    }    case Hexagon::ArchEnum::NoArch: @@ -538,8 +547,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    FeatureBitset FB = S;    unsigned CpuArch = ArchV5;  
  for (unsigned F : -       {ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, ArchV66, -        ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { +       {ArchV81, ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, +        ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {      if (!FB.test(F))        continue;      CpuArch = F; @@ -556,7 +565,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    for (unsigned F :         {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66,          ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, -        ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79}) { +        ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79, ExtensionHVXV81}) {      if (!FB.test(F))        continue;      HasHvxVer = true; @@ -569,6 +578,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    // HasHvxVer is false, and UseHvx is true.    switch (CpuArch) { +  case ArchV81: +    FB.set(ExtensionHVXV81); +    [[fallthrough]];    case ArchV79:      FB.set(ExtensionHVXV79);      [[fallthrough]]; @@ -668,12 +680,12 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {  std::optional<unsigned>  Hexagon_MC::getHVXVersion(const FeatureBitset &Features) { -  for (auto Arch : {Hexagon::ExtensionHVXV79, Hexagon::ExtensionHVXV75, -                    Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71, -                    Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68, -                    Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66, -                    Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62, -                    Hexagon::ExtensionHVXV60}) +  for (auto Arch : {Hexagon::ExtensionHVXV81, Hexagon::ExtensionHVXV79, +                    Hexagon::ExtensionHVXV75, Hexagon::ExtensionHVXV73, +                    Hexagon::ExtensionHVXV71, Hexagon::ExtensionHVXV69, +                    Hexagon::ExtensionHVXV68, 
Hexagon::ExtensionHVXV67, +                    Hexagon::ExtensionHVXV66, Hexagon::ExtensionHVXV65, +                    Hexagon::ExtensionHVXV62, Hexagon::ExtensionHVXV60})      if (Features.test(Arch))        return Arch;    return {}; @@ -681,13 +693,13 @@ Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {  unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {    for (auto Arch : -       {Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, Hexagon::ArchV71, -        Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, Hexagon::ArchV66, -        Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, Hexagon::ArchV55, -        Hexagon::ArchV5}) +       {Hexagon::ArchV81, Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, +        Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, +        Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, +        Hexagon::ArchV55, Hexagon::ArchV5})      if (Features.test(Arch))        return Arch; -  llvm_unreachable("Expected arch v5-v79"); +  llvm_unreachable("Expected arch v5-v81");    return 0;  } @@ -708,7 +720,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {        .Case("hexagonv71t", llvm::ELF::EF_HEXAGON_MACH_V71T)        .Case("hexagonv73", llvm::ELF::EF_HEXAGON_MACH_V73)        .Case("hexagonv75", llvm::ELF::EF_HEXAGON_MACH_V75) -      .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79); +      .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79) +      .Case("hexagonv81", llvm::ELF::EF_HEXAGON_MACH_V81);  }  llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index aca7abd..44d1a44 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4578,6 +4578,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;  def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;  def : 
InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>; +def : InstAlias<"mtpidr $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsISA3_0]>; +def : InstAlias<"mfpidr $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsISA3_0]>;  foreach SPRG = 4-7 in {    def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 9e6b7f0..2754d78 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1124,7 +1124,8 @@ def HasStdExtZbkbOrP                           "'Base P' (Packed-SIMD)">;  def HasStdExtZbbOrZbkbOrP -    : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">, +    : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || " +                "Subtarget->hasStdExtP()">,        AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP),                           "'Zbb' (Basic Bit-Manipulation) or "                           "'Zbkb' (Bitmanip instructions for Cryptography) or " diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 26fe9ed..1c930ac 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -318,8 +318,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); -  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() && -      !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() && +  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() && +      !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() && +      !Subtarget.hasVendorXAndesPerf() &&        !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))      setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); @@ -392,7 +393,7 @@ 
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,        setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);    } -  if (Subtarget.hasStdExtZbb() || +  if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() ||        (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {      setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,                         Legal); @@ -403,6 +404,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,        setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);    } else {      setOperationAction(ISD::CTTZ, XLenVT, Expand); +    // If have a CLZW, but not CTZW, custom promote i32. +    if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) +      setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);    }    if (!Subtarget.hasCPOPLike()) { @@ -419,13 +423,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,      // We need the custom lowering to make sure that the resulting sequence      // for the 32bit case is efficient on 64bit targets.      // Use default promotion for i32 without Zbb. -    if (Subtarget.is64Bit() && Subtarget.hasStdExtZbb()) +    if (Subtarget.is64Bit() && +        (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP()))        setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);    } else {      setOperationAction(ISD::CTLZ, XLenVT, Expand);    } -  if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) { +  if (Subtarget.hasStdExtP() || +      (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {      setOperationAction(ISD::ABS, XLenVT, Legal);    } else if (Subtarget.hasShortForwardBranchOpt()) {      // We can use PseudoCCSUB to implement ABS. 
@@ -14669,6 +14675,25 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));      bool IsCTZ =          N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF; + +    // Without Zbb, lower as 32 - clzw(~X & (X-1)) +    if (IsCTZ && !Subtarget.hasStdExtZbb()) { +      assert(Subtarget.hasStdExtP()); + +      NewOp0 = DAG.getFreeze(NewOp0); +      SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64); +      SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0, +                                   DAG.getConstant(1, DL, MVT::i64)); +      SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1); +      SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And); +      SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64, +                                DAG.getConstant(32, DL, MVT::i64), CLZW); +      SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub, +                                DAG.getValueType(MVT::i32)); +      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); +      return; +    } +      unsigned Opc = IsCTZ ? RISCVISD::CTZW : RISCVISD::CLZW;      SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0);      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); @@ -14797,7 +14822,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,        // to NEGW+MAX here requires a Freeze which breaks ComputeNumSignBits.        SDValue Src = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64,                                  N->getOperand(0)); -      SDValue Abs = DAG.getNode(RISCVISD::ABSW, DL, MVT::i64, Src); +      SDValue Abs = DAG.getNode(RISCVISD::NEGW_MAX, DL, MVT::i64, Src);        Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Abs));        return;      } @@ -21813,7 +21838,7 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(      // Output is either all zero or operand 0. 
We can propagate sign bit count      // from operand 0.      return DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); -  case RISCVISD::ABSW: { +  case RISCVISD::NEGW_MAX: {      // We expand this at isel to negw+max. The result will have 33 sign bits      // if the input has at least 33 sign bits.      unsigned Tmp = diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7d8a919..cc085bb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1455,3 +1455,11 @@ let Predicates = [HasStdExtP, IsRV32] in {    def PMAXU_DW     : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">;    def PMAXU_DB     : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">;  } // Predicates = [HasStdExtP, IsRV32] + + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtP] in +def : PatGpr<abs, ABS>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 4104abd..f7b4914 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -218,11 +218,13 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0,  }  let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { -  def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +  def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">, +                   SchedUnaryMC<"WriteSF_VFExp", "ReadSF_VFExp">;  }  let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { -  def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +  def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">, +                    SchedUnaryMC<"WriteSF_VFExpa", "ReadSF_VFExpa">;  }  let Predicates = 
[HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", @@ -482,11 +484,53 @@ let Predicates = [HasVendorXSfvfwmaccqqq] in {    defm SF_VFWMACC_4x4x4 : VPseudoSiFiveVFWMACC;  } -let Predicates = [HasVendorXSfvfnrclipxfqf] in { +let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {    defm SF_VFNRCLIP_XU_F_QF : VPseudoSiFiveVFNRCLIP;    defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;  } +class VFExpSchedSEWSet<string mx, bit IsBF16, bit IsApprox> { +  defvar BaseSet = SchedSEWSet<mx, isF=1>.val; +  list<int> val = !if(IsBF16, !listremove(BaseSet, [32, 64]), +                      !if(IsApprox, BaseSet, !listremove(BaseSet, [64]))); +} +multiclass VPseudoVFExp_V<bit IsBF16 = false, bit IsApprox = false> { +  defvar SchedSuffix = !if(IsApprox, "VFExpa", "VFExp"); + +  foreach m = MxListF in { +    defvar mx = m.MX; +    foreach e = VFExpSchedSEWSet<mx, IsBF16, IsApprox>.val in { +      let VLMul = m.value in { +        def "_V_" # mx # "_E" # e +            : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, +              SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, +                         mx, e, forcePassthruRead=true>; +        def "_V_" # mx # "_E" # e # "_MASK" +            : VPseudoUnaryMask<m.vrclass, m.vrclass>, +              RISCVMaskedPseudo<MaskIdx = 2>, +              SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, +                         mx, e, forcePassthruRead=true>; +      } +    } +  } +} + +let Predicates = [HasVendorXSfvfbfexp16e], hasSideEffects = 0 in { +  let AltFmtType = IS_ALTFMT in { +    defm PseudoSF_VFEXP_ALT : VPseudoVFExp_V<IsBF16=true>; +  } +} + +let Predicates = [HasVendorXSfvfexpAnyFloat], hasSideEffects = 0 in { +  let AltFmtType = IS_NOT_ALTFMT in { +    defm PseudoSF_VFEXP : VPseudoVFExp_V; +  } +} + +let Predicates = [HasVendorXSfvfexpa], AltFmtType = IS_NOT_ALTFMT in { +  defm PseudoSF_VFEXPA : VPseudoVFExp_V<IsApprox=true>; +} +  // SDNode  def SDT_SF_VC_V_X : SDTypeProfile<1, 
4, [SDTCisVec<0>,                                           SDTCisVT<1, XLenVT>, @@ -893,3 +937,36 @@ let Predicates = [HasVendorXSfcease] in {      let rs2 = 0b00101;  }  } + +let Predicates = [HasVendorXSfvfbfexp16e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP_ALT", +                      AllBF16Vectors, +                      isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexp16e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", +                      [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], +                      isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexp32e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", +                      [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa, HasVInstructionsF16] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], +                      isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa64e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF64M1, VF64M2, VF64M4, VF64M8], isSEWAware=1>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 62b7bcd..5429c2a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -51,7 +51,7 @@ def riscv_zip     : RVSDNode<"ZIP",     SDTIntUnaryOp>;  def riscv_unzip   : RVSDNode<"UNZIP",   SDTIntUnaryOp>;  // RV64IZbb absolute value for i32. Expanded to (max (negw X), X) during isel. 
-def riscv_absw    : RVSDNode<"ABSW",    SDTIntUnaryOp>; +def riscv_negw_max : RVSDNode<"NEGW_MAX",    SDTIntUnaryOp>;  // Scalar cryptography  def riscv_clmul   : RVSDNode<"CLMUL",   SDTIntBinOp>; @@ -599,37 +599,43 @@ def : PatGpr<riscv_zip, ZIP_RV32, i32>;  def : PatGpr<riscv_unzip, UNZIP_RV32, i32>;  } // Predicates = [HasStdExtZbkb, IsRV32] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : PatGpr<ctlz, CLZ>; +} + +let Predicates = [HasStdExtZbb] in {  def : PatGpr<cttz, CTZ>;  def : PatGpr<ctpop, CPOP>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb, IsRV64] in { +let Predicates = [HasStdExtZbbOrP, IsRV64] in {  def : PatGpr<riscv_clzw, CLZW>; +} + +let Predicates = [HasStdExtZbb, IsRV64] in {  def : PatGpr<riscv_ctzw, CTZW>;  def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; -def : Pat<(i64 (riscv_absw GPR:$rs1)), +def : Pat<(i64 (riscv_negw_max GPR:$rs1)),            (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;  } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>;  def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : PatGprGpr<smin, MIN>;  def : PatGprGpr<smax, MAX>;  def : PatGprGpr<umin, MINU>;  def : PatGprGpr<umax, MAXU>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in  def : PatGpr<bswap, REV8_RV32, i32>; -let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in  def : PatGpr<bswap, REV8_RV64, i64>;  let Predicates = [HasStdExtZbkb] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 637d61fe..36a2f46 100644 
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -1588,6 +1588,10 @@ multiclass SiFive7SchedResources<int vlen, bit dualVALU,    //===----------------------------------------------------------------------===//    // Unsupported extensions    defm : UnsupportedSchedQ; +  // TODO: scheduling info of XSfvfexp* and XSfvfexpa* +  // for SiFive7 will be added in follow-up patches. +  defm : UnsupportedSchedXSfvfexp; +  defm : UnsupportedSchedXSfvfexpa;    defm : UnsupportedSchedZabha;    defm : UnsupportedSchedZbc;    defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 9ab9636..64ccfd8 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -523,6 +523,8 @@ include "RISCVScheduleZvk.td"  // Vendor Extensions  multiclass UnsupportedSchedXsf {    defm : UnsupportedSchedXsfvcp; +  defm : UnsupportedSchedXSfvfexp; +  defm : UnsupportedSchedXSfvfexpa;    defm : UnsupportedSchedXSfvfnrclipxfqf;    defm : UnsupportedSchedXSfvfwmaccqqq;    defm : UnsupportedSchedXSfvqmaccdod; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td index 99632e4..1ee6dc1 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -99,3 +99,23 @@ defm : LMULWriteRes<"WriteSF_VFWMACC_QQQ", []>;  defm : LMULReadAdvance<"ReadSF_VFWMACC_QQQ", 0>;  } // Unsupported = true  } + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExp">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExp">; + +multiclass UnsupportedSchedXSfvfexp { +let Unsupported = true in { +defm : LMULSEWWriteResF<"WriteSF_VFExp", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExp", 0>; +} // Unsupported = true +} + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExpa">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExpa">; + +multiclass UnsupportedSchedXSfvfexpa { +let Unsupported = true in { +defm : 
LMULSEWWriteResF<"WriteSF_VFExpa", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExpa", 0>; +} // Unsupported = true +} diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 334db4b..4b4fc8f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -187,7 +187,7 @@ public:    }    bool hasCLZLike() const { -    return HasStdExtZbb || HasVendorXTHeadBb || +    return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb ||             (HasVendorXCVbitmanip && !IsRV64);    }    bool hasCTZLike() const { @@ -197,7 +197,7 @@ public:      return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64);    }    bool hasREV8Like() const { -    return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb; +    return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb;    }    bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 62073ec..4393f6e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4721,9 +4721,6 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {    if (!(Subtarget->hasVLX() || NVT.is512BitVector()))      return false; -  SDValue N0 = N->getOperand(0); -  SDValue N1 = N->getOperand(1); -    auto getFoldableLogicOp = [](SDValue Op) {      // Peek through single use bitcast.      
if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) @@ -4740,13 +4737,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {      return SDValue();    }; -  SDValue A, FoldableOp; -  if ((FoldableOp = getFoldableLogicOp(N1))) { -    A = N0; -  } else if ((FoldableOp = getFoldableLogicOp(N0))) { -    A = N1; -  } else -    return false; +  SDValue N0, N1, A, FoldableOp; + +  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree +  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { +    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && +        ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { +      SDValue InnerOp = Op->getOperand(0); + +      if (!getFoldableLogicOp(InnerOp)) +        return SDValue(); + +      N0 = InnerOp.getOperand(0); +      N1 = InnerOp.getOperand(1); +      if ((FoldableOp = getFoldableLogicOp(N1))) { +        A = N0; +        return InnerOp; +      } +      if ((FoldableOp = getFoldableLogicOp(N0))) { +        A = N1; +        return InnerOp; +      } +    } +    return SDValue(); +  }; + +  bool PeeledOuterNot = false; +  SDNode *OriN = N; +  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) { +    PeeledOuterNot = true; +    N = InnerOp.getNode(); +  } else { +    N0 = N->getOperand(0); +    N1 = N->getOperand(1); + +    if ((FoldableOp = getFoldableLogicOp(N1))) +      A = N0; +    else if ((FoldableOp = getFoldableLogicOp(N0))) +      A = N1; +    else +      return false; +  }    SDValue B = FoldableOp.getOperand(0);    SDValue C = FoldableOp.getOperand(1); @@ -4798,7 +4829,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {    case ISD::XOR: Imm ^= TernlogMagicA; break;    } -  return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm); +  if (PeeledOuterNot) +    Imm = ~Imm; + +  return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);  }  /// If the high bits of an 'and' operand are known zero, try setting the diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4dfc400..410f20e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57617,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,    }    // Fold any similar generic ADD/SUB opcodes to reuse this node. -  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { +  auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {      SDValue Ops[] = {N0, N1};      SDVTList VTs = DAG.getVTList(N->getValueType(0)); -    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { +    if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {        SDValue Op(N, 0);        if (Negate) {          // Bail if this is only used by a user of the x86 add/sub. @@ -57632,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,        DCI.CombineTo(GenericAddSub, Op);      }    }; -  MatchGeneric(LHS, RHS, false); -  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); +  MatchGeneric(GenericOpc, LHS, RHS, false); +  MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode()); + +  if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) { +    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); +    if (X86ISD::SUB == N->getOpcode()) { +      // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C). +      MatchGeneric(ISD::ADD, LHS, NegC, false); +    } else { +      // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS). +      MatchGeneric(ISD::SUB, NegC, LHS, true); +    } +  } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) { +    if (X86ISD::SUB == N->getOpcode()) { +      SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); +      // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C). +      MatchGeneric(ISD::ADD, RHS, NegC, true); +    } +  }    // TODO: Can we drop the ZeroSecondOpOnly limit? 
This is to guarantee that the    // EFLAGS result doesn't change. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c1..b7151f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1592,7 +1592,6 @@ namespace llvm {      bool useLoadStackGuardNode(const Module &M) const override;      bool useStackGuardXorFP() const override;      void insertSSPDeclarations(Module &M) const override; -    Function *getSSPStackGuardCheck(const Module &M) const override;      SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,                                  const SDLoc &DL) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 37d7772..a61bbe5 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -640,15 +640,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || -      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { -    return M.getFunction("__security_check_cookie"); -  } -  return TargetLowering::getSSPStackGuardCheck(M); -} -  Value *  X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    // Android provides a fixed TLS slot for the SafeStack pointer. 
See the diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index edcf247..632c6a2 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -1407,7 +1407,7 @@ let isBarrier = 1, isTerminator = 1 in {      let r = 0x04;    } -  def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm), +  def BREAK_N : RRRN_Inst<0x0D, (outs), (ins uimm4:$imm),                           "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> {      bits<4> imm; diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 0fce5b9..709e5f0 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -88,6 +88,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {    case ArchKind::ARMV9_4A:    case ArchKind::ARMV9_5A:    case ArchKind::ARMV9_6A: +  case ArchKind::ARMV9_7A:      return 9;    case ArchKind::INVALID:      return 0; @@ -127,6 +128,7 @@ static ARM::ProfileKind getProfileKind(ARM::ArchKind AK) {    case ARM::ArchKind::ARMV9_4A:    case ARM::ArchKind::ARMV9_5A:    case ARM::ArchKind::ARMV9_6A: +  case ARM::ArchKind::ARMV9_7A:      return ARM::ProfileKind::A;    case ARM::ArchKind::ARMV4:    case ARM::ArchKind::ARMV4T: diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index f6cea85..15ba1eb 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -46,6 +46,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) {        .Case("v9.4a", "v9.4-a")        .Case("v9.5a", "v9.5-a")        .Case("v9.6a", "v9.6-a") +      .Case("v9.7a", "v9.7-a")        .Case("v8m.base", "v8-m.base")        .Case("v8m.main", "v8-m.main")        .Case("v8.1m.main", "v8.1-m.main") diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 1068ce4..11ba9ee 100644 --- 
a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -937,6 +937,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {      return Triple::ARMSubArch_v9_5a;    case ARM::ArchKind::ARMV9_6A:      return Triple::ARMSubArch_v9_6a; +  case ARM::ArchKind::ARMV9_7A: +    return Triple::ARMSubArch_v9_7a;    case ARM::ArchKind::ARMV8R:      return Triple::ARMSubArch_v8r;    case ARM::ArchKind::ARMV8MBaseline: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 669d4f0..8d9933b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -582,6 +582,18 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {            IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1);        return BinaryOperator::CreateSub(ConstCtlz, X);      } + +    // ctlz(~x & (x - 1)) -> bitwidth - cttz(x, false) +    if (Op0->hasOneUse() && +        match(Op0, +              m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { +      Type *Ty = II.getType(); +      unsigned BitWidth = Ty->getScalarSizeInBits(); +      auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty, +                                              {X, IC.Builder.getFalse()}); +      auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); +      return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); +    }    }    // cttz(Pow2) -> Log2(Pow2) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 5aa8de3..f5130da 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4697,5 +4697,31 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {                  cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),                 
 CondVal, FalseVal)); +  // Canonicalize sign function ashr pattern: select (icmp slt X, 1), ashr X, +  // bitwidth-1, 1 -> scmp(X, 0) +  // Also handles: select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) +  unsigned BitWidth = SI.getType()->getScalarSizeInBits(); +  CmpPredicate Pred; +  Value *CmpLHS, *CmpRHS; + +  // Canonicalize sign function ashr patterns: +  // select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0) +  // select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) +  if (match(&SI, m_Select(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), +                          m_Value(TrueVal), m_Value(FalseVal))) && +      ((Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_One()) && +        match(TrueVal, +              m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1))) && +        match(FalseVal, m_One())) || +       (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_Zero()) && +        match(TrueVal, m_One()) && +        match(FalseVal, +              m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1)))))) { + +    Function *Scmp = Intrinsic::getOrInsertDeclaration( +        SI.getModule(), Intrinsic::scmp, {SI.getType(), SI.getType()}); +    return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)}); +  } +    return nullptr;  } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67e2aae..9c8de45 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2327,6 +2327,18 @@ Constant *InstCombinerImpl::unshuffleConstant(ArrayRef<int> ShMask, Constant *C,    return ConstantVector::get(NewVecC);  } +// Get the result of `Vector Op Splat` (or Splat Op Vector if \p SplatLHS). 
+static Constant *constantFoldBinOpWithSplat(unsigned Opcode, Constant *Vector, +                                            Constant *Splat, bool SplatLHS, +                                            const DataLayout &DL) { +  ElementCount EC = cast<VectorType>(Vector->getType())->getElementCount(); +  Constant *LHS = ConstantVector::getSplat(EC, Splat); +  Constant *RHS = Vector; +  if (!SplatLHS) +    std::swap(LHS, RHS); +  return ConstantFoldBinaryOpOperands(Opcode, LHS, RHS, DL); +} +  Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {    if (!isa<VectorType>(Inst.getType()))      return nullptr; @@ -2338,6 +2350,37 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {    assert(cast<VectorType>(RHS->getType())->getElementCount() ==           cast<VectorType>(Inst.getType())->getElementCount()); +  auto foldConstantsThroughSubVectorInsertSplat = +      [&](Value *MaybeSubVector, Value *MaybeSplat, +          bool SplatLHS) -> Instruction * { +    Value *Idx; +    Constant *Splat, *SubVector, *Dest; +    if (!match(MaybeSplat, m_ConstantSplat(m_Constant(Splat))) || +        !match(MaybeSubVector, +               m_VectorInsert(m_Constant(Dest), m_Constant(SubVector), +                              m_Value(Idx)))) +      return nullptr; +    SubVector = +        constantFoldBinOpWithSplat(Opcode, SubVector, Splat, SplatLHS, DL); +    Dest = constantFoldBinOpWithSplat(Opcode, Dest, Splat, SplatLHS, DL); +    if (!SubVector || !Dest) +      return nullptr; +    auto *InsertVector = +        Builder.CreateInsertVector(Dest->getType(), Dest, SubVector, Idx); +    return replaceInstUsesWith(Inst, InsertVector); +  }; + +  // If one operand is a constant splat and the other operand is a +  // `vector.insert` where both the destination and subvector are constant, +  // apply the operation to both the destination and subvector, returning a new +  // constant `vector.insert`. This helps constant folding for scalable vectors. 
+  if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( +          /*MaybeSubVector=*/LHS, /*MaybeSplat=*/RHS, /*SplatLHS=*/false)) +    return Folded; +  if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( +          /*MaybeSubVector=*/RHS, /*MaybeSplat=*/LHS, /*SplatLHS=*/true)) +    return Folded; +    // If both operands of the binop are vector concatenations, then perform the    // narrow binop on each pair of the source operands followed by concatenation    // of the results. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b6cbecb..10b03bb 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -226,6 +226,7 @@ static const Align kMinOriginAlignment = Align(4);  static const Align kShadowTLSAlignment = Align(8);  // These constants must be kept in sync with the ones in msan.h. +// TODO: increase size to match SVE/SVE2/SME/SME2 limits  static const unsigned kParamTLSSize = 800;  static const unsigned kRetvalTLSSize = 800; @@ -1544,6 +1545,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      }    } +  static bool isAArch64SVCount(Type *Ty) { +    if (TargetExtType *TTy = dyn_cast<TargetExtType>(Ty)) +      return TTy->getName() == "aarch64.svcount"; +    return false; +  } + +  // This is intended to match the "AArch64 Predicate-as-Counter Type" (aka +  // 'target("aarch64.svcount")', but not e.g., <vscale x 4 x i32>. +  static bool isScalableNonVectorType(Type *Ty) { +    if (!isAArch64SVCount(Ty)) +      LLVM_DEBUG(dbgs() << "isScalableNonVectorType: Unexpected type " << *Ty +                        << "\n"); + +    return Ty->isScalableTy() && !isa<VectorType>(Ty); +  } +    void materializeChecks() {  #ifndef NDEBUG      // For assert below. 
@@ -1672,6 +1689,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");        return Res;      } +    if (isScalableNonVectorType(OrigTy)) { +      LLVM_DEBUG(dbgs() << "getShadowTy: Scalable non-vector type: " << *OrigTy +                        << "\n"); +      return OrigTy; +    } +      uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);      return IntegerType::get(*MS.C, TypeSize);    } @@ -2185,8 +2208,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {                          << *OrigIns << "\n");        return;      } -#ifndef NDEBUG +      Type *ShadowTy = Shadow->getType(); +    if (isScalableNonVectorType(ShadowTy)) { +      LLVM_DEBUG(dbgs() << "Skipping check of scalable non-vector " << *Shadow +                        << " before " << *OrigIns << "\n"); +      return; +    } +#ifndef NDEBUG      assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) ||              isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) &&             "Can only insert checks for integer, vector, and aggregate shadow " @@ -6972,6 +7001,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        // an extra "select". This results in much more compact IR.        // Sa = select Sb, poisoned, (select b, Sc, Sd)        Sa1 = getPoisonedShadow(getShadowTy(I.getType())); +    } else if (isScalableNonVectorType(I.getType())) { +      // This is intended to handle target("aarch64.svcount"), which can't be +      // handled in the else branch because of incompatibility with CreateXor +      // ("The supported LLVM operations on this type are limited to load, +      // store, phi, select and alloca instructions"). + +      // TODO: this currently underapproximates. Use Arm SVE EOR in the else +      //       branch as needed instead. 
+      Sa1 = getCleanShadow(getShadowTy(I.getType()));      } else {        // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]        // If Sb (condition is poisoned), look for bits in c and d that are equal diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a1ad2db..2591df8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4172,11 +4172,6 @@ class VPlan {    /// definitions are VPValues that hold a pointer to their underlying IR.    SmallVector<VPValue *, 16> VPLiveIns; -  /// Mapping from SCEVs to the VPValues representing their expansions. -  /// NOTE: This mapping is temporary and will be removed once all users have -  /// been modeled in VPlan directly. -  DenseMap<const SCEV *, VPValue *> SCEVToExpansion; -    /// Blocks allocated and owned by the VPlan. They will be deleted once the    /// VPlan is destroyed.    SmallVector<VPBlockBase *> CreatedBlocks; @@ -4424,15 +4419,6 @@ public:    LLVM_DUMP_METHOD void dump() const;  #endif -  VPValue *getSCEVExpansion(const SCEV *S) const { -    return SCEVToExpansion.lookup(S); -  } - -  void addSCEVExpansion(const SCEV *S, VPValue *V) { -    assert(!SCEVToExpansion.contains(S) && "SCEV already expanded"); -    SCEVToExpansion[S] = V; -  } -    /// Clone the current VPlan, update all VPValues of the new VPlan and cloned    /// recipes to refer to the clones, and return it.    VPlan *duplicate(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3e85e6f..84817d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -943,12 +943,40 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {    }  } +/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. +/// Returns an optional pair, where the first element indicates whether it is +/// an intrinsic ID. 
+static std::optional<std::pair<bool, unsigned>> +getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { +  return TypeSwitch<const VPSingleDefRecipe *, +                    std::optional<std::pair<bool, unsigned>>>(R) +      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, +            VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( +          [](auto *I) { return std::make_pair(false, I->getOpcode()); }) +      .Case<VPWidenIntrinsicRecipe>([](auto *I) { +        return std::make_pair(true, I->getVectorIntrinsicID()); +      }) +      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { +        // For recipes that do not directly map to LLVM IR instructions, +        // assign opcodes after the last VPInstruction opcode (which is also +        // after the last IR Instruction opcode), based on the VPDefID. +        return std::make_pair(false, +                              VPInstruction::OpsEnd + 1 + I->getVPDefID()); +      }) +      .Default([](auto *) { return std::nullopt; }); +} +  /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a -/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are -/// foldable live-ins. -static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, -                               ArrayRef<VPValue *> Operands, -                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) { +/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p +/// Operands are foldable live-ins. 
+static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, +                                 ArrayRef<VPValue *> Operands, +                                 const DataLayout &DL, +                                 VPTypeAnalysis &TypeInfo) { +  auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R); +  if (!OpcodeOrIID) +    return nullptr; +    SmallVector<Value *, 4> Ops;    for (VPValue *Op : Operands) {      if (!Op->isLiveIn() || !Op->getLiveInIRValue()) @@ -956,43 +984,57 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,      Ops.push_back(Op->getLiveInIRValue());    } -  InstSimplifyFolder Folder(DL); -  if (Instruction::isBinaryOp(Opcode)) -    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0], +  auto FoldToIRValue = [&]() -> Value * { +    InstSimplifyFolder Folder(DL); +    if (OpcodeOrIID->first) { +      if (R.getNumOperands() != 2) +        return nullptr; +      unsigned ID = OpcodeOrIID->second; +      return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], +                                        TypeInfo.inferScalarType(&R)); +    } +    unsigned Opcode = OpcodeOrIID->second; +    if (Instruction::isBinaryOp(Opcode)) +      return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), +                              Ops[0], Ops[1]); +    if (Instruction::isCast(Opcode)) +      return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], +                             TypeInfo.inferScalarType(R.getVPSingleValue())); +    switch (Opcode) { +    case VPInstruction::LogicalAnd: +      return Folder.FoldSelect(Ops[0], Ops[1], +                               ConstantInt::getNullValue(Ops[1]->getType())); +    case VPInstruction::Not: +      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], +                              Constant::getAllOnesValue(Ops[0]->getType())); +    case Instruction::Select: +      return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); +    case Instruction::ICmp: +    case 
Instruction::FCmp: +      return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],                              Ops[1]); -  if (Instruction::isCast(Opcode)) -    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], -                           TypeInfo.inferScalarType(R.getVPSingleValue())); -  switch (Opcode) { -  case VPInstruction::LogicalAnd: -    return Folder.FoldSelect(Ops[0], Ops[1], -                             ConstantInt::getNullValue(Ops[1]->getType())); -  case VPInstruction::Not: -    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], -                            Constant::getAllOnesValue(Ops[0]->getType())); -  case Instruction::Select: -    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); -  case Instruction::ICmp: -  case Instruction::FCmp: -    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0], -                          Ops[1]); -  case Instruction::GetElementPtr: { -    auto &RFlags = cast<VPRecipeWithIRFlags>(R); -    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); -    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops), -                          RFlags.getGEPNoWrapFlags()); -  } -  case VPInstruction::PtrAdd: -  case VPInstruction::WidePtrAdd: -    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], -                          Ops[1], -                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); -  // An extract of a live-in is an extract of a broadcast, so return the -  // broadcasted element. 
-  case Instruction::ExtractElement: -    assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); -    return Ops[0]; -  } +    case Instruction::GetElementPtr: { +      auto &RFlags = cast<VPRecipeWithIRFlags>(R); +      auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); +      return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], +                            drop_begin(Ops), RFlags.getGEPNoWrapFlags()); +    } +    case VPInstruction::PtrAdd: +    case VPInstruction::WidePtrAdd: +      return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), +                            Ops[0], Ops[1], +                            cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); +    // An extract of a live-in is an extract of a broadcast, so return the +    // broadcasted element. +    case Instruction::ExtractElement: +      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); +      return Ops[0]; +    } +    return nullptr; +  }; + +  if (Value *V = FoldToIRValue()) +    return R.getParent()->getPlan()->getOrAddLiveIn(V);    return nullptr;  } @@ -1006,19 +1048,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {    // Simplification of live-in IR values for SingleDef recipes using    // InstSimplifyFolder. 
-  if (TypeSwitch<VPRecipeBase *, bool>(&R) -          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, -                VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) { -            const DataLayout &DL = -                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); -            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, -                                        TypeInfo); -            if (V) -              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V)); -            return V; -          }) -          .Default([](auto *) { return false; })) -    return; +  const DataLayout &DL = +      Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); +  if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo)) +    return Def->replaceAllUsesWith(V);    // Fold PredPHI LiveIn -> LiveIn.    if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { @@ -1996,29 +2029,6 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {      return Def == getEmptyKey() || Def == getTombstoneKey();    } -  /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. -  /// Returns an optional pair, where the first element indicates whether it is -  /// an intrinsic ID. 
-  static std::optional<std::pair<bool, unsigned>> -  getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { -    return TypeSwitch<const VPSingleDefRecipe *, -                      std::optional<std::pair<bool, unsigned>>>(R) -        .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, -              VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( -            [](auto *I) { return std::make_pair(false, I->getOpcode()); }) -        .Case<VPWidenIntrinsicRecipe>([](auto *I) { -          return std::make_pair(true, I->getVectorIntrinsicID()); -        }) -        .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { -          // For recipes that do not directly map to LLVM IR instructions, -          // assign opcodes after the last VPInstruction opcode (which is also -          // after the last IR Instruction opcode), based on the VPDefID. -          return std::make_pair(false, -                                VPInstruction::OpsEnd + 1 + I->getVPDefID()); -        }) -        .Default([](auto *) { return std::nullopt; }); -  } -    /// If recipe \p R will lower to a GEP with a non-i8 source element type,    /// return that source element type.    
static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) { @@ -4119,7 +4129,7 @@ static bool isAlreadyNarrow(VPValue *VPV) {  void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,                                               unsigned VectorRegWidth) {    VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); -  if (!VectorLoop) +  if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)      return;    VPTypeAnalysis TypeInfo(Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 06c3d75..fe66f13 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,8 +32,6 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {  }  VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { -  if (auto *Expanded = Plan.getSCEVExpansion(Expr)) -    return Expanded;    VPValue *Expanded = nullptr;    if (auto *E = dyn_cast<SCEVConstant>(Expr))      Expanded = Plan.getOrAddLiveIn(E->getValue()); @@ -50,7 +48,6 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {        Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());      }    } -  Plan.addSCEVExpansion(Expr, Expanded);    return Expanded;  } | 
