diff options
Diffstat (limited to 'llvm/lib')
77 files changed, 4091 insertions, 695 deletions
| diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 853bd66..a572eef 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -1582,6 +1582,23 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,    return nullptr;  } +/// Returns the absolute value of \p A. In the context of dependence analysis, +/// we need an absolute value in a mathematical sense. If \p A is the signed +/// minimum value, we cannot represent it unless extending the original type. +/// Thus if we cannot prove that \p A is not the signed minimum value, returns +/// nullptr. Also returns nullptr if \p A is not of integer type. +static const SCEV *absSCEVNoSignedOverflow(const SCEV *A, ScalarEvolution &SE) { +  IntegerType *Ty = dyn_cast<IntegerType>(A->getType()); +  if (!Ty) +    return nullptr; + +  const SCEV *SMin = +      SE.getConstant(APInt::getSignedMinValue(Ty->getBitWidth())); +  if (!SE.isKnownPredicate(CmpInst::ICMP_NE, A, SMin)) +    return nullptr; +  return SE.getAbsExpr(A, /*IsNSW=*/true); +} +  /// Returns true iff \p Test is enabled.  static bool isDependenceTestEnabled(DependenceTestType Test) {    if (EnableDependenceTest == DependenceTestType::All) @@ -1669,21 +1686,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,    LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");    // check that |Delta| < iteration count -  if (const SCEV *UpperBound = -          collectUpperBound(CurSrcLoop, Delta->getType())) { +  bool IsDeltaLarge = [&] { +    const SCEV *UpperBound = collectUpperBound(CurSrcLoop, Delta->getType()); +    if (!UpperBound) +      return false; +      LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound);      LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n"); -    const SCEV *AbsDelta = -        SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta); -    const SCEV *AbsCoeff = -        SE->isKnownNonNegative(Coeff) ? 
Coeff : SE->getNegativeSCEV(Coeff); +    const SCEV *AbsDelta = absSCEVNoSignedOverflow(Delta, *SE); +    const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE); +    if (!AbsDelta || !AbsCoeff) +      return false;      const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff); -    if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) { -      // Distance greater than trip count - no dependence -      ++StrongSIVindependence; -      ++StrongSIVsuccesses; -      return true; -    } +    return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product); +  }(); +  if (IsDeltaLarge) { +    // Distance greater than trip count - no dependence +    ++StrongSIVindependence; +    ++StrongSIVsuccesses; +    return true;    }    // Can we compute distance? @@ -2259,6 +2280,9 @@ bool DependenceInfo::weakZeroSrcSIVtest(    const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);    if (!ConstCoeff)      return false; + +  // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. +  // TODO: Bail out if it's a signed minimum value.    const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)                               ? SE->getNegativeSCEV(ConstCoeff)                               : ConstCoeff; @@ -2369,6 +2393,9 @@ bool DependenceInfo::weakZeroDstSIVtest(    const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);    if (!ConstCoeff)      return false; + +  // Since ConstCoeff is constant, !isKnownNegative means it's non-negative. +  // TODO: Bail out if it's a signed minimum value.    const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)                               ? 
SE->getNegativeSCEV(ConstCoeff)                               : ConstCoeff; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b425b95..1f10478 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -391,19 +391,6 @@ void CombinerHelper::applyCombineConcatVectors(    MI.eraseFromParent();  } -bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const { -  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && -         "Invalid instruction"); -  auto &Shuffle = cast<GShuffleVector>(MI); - -  Register SrcVec1 = Shuffle.getSrc1Reg(); -  Register SrcVec2 = Shuffle.getSrc2Reg(); - -  LLT SrcVec1Type = MRI.getType(SrcVec1); -  LLT SrcVec2Type = MRI.getType(SrcVec2); -  return SrcVec1Type.isVector() && SrcVec2Type.isVector(); -} -  void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const {    auto &Shuffle = cast<GShuffleVector>(MI); @@ -535,11 +522,9 @@ bool CombinerHelper::matchCombineShuffleVector(    LLT DstType = MRI.getType(MI.getOperand(0).getReg());    Register Src1 = MI.getOperand(1).getReg();    LLT SrcType = MRI.getType(Src1); -  // As bizarre as it may look, shuffle vector can actually produce -  // scalar! This is because at the IR level a <1 x ty> shuffle -  // vector is perfectly valid. -  unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1; -  unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1; + +  unsigned DstNumElts = DstType.getNumElements(); +  unsigned SrcNumElts = SrcType.getNumElements();    // If the resulting vector is smaller than the size of the source    // vectors being concatenated, we won't be able to replace the @@ -556,7 +541,7 @@ bool CombinerHelper::matchCombineShuffleVector(    //    // TODO: If the size between the source and destination don't match    //       we could still emit an extract vector element in that case. 
-  if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1) +  if (DstNumElts < 2 * SrcNumElts)      return false;    // Check that the shuffle mask can be broken evenly between the @@ -619,39 +604,6 @@ void CombinerHelper::applyCombineShuffleVector(    MI.eraseFromParent();  } -bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) const { -  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && -         "Invalid instruction kind"); - -  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); -  return Mask.size() == 1; -} - -void CombinerHelper::applyShuffleToExtract(MachineInstr &MI) const { -  Register DstReg = MI.getOperand(0).getReg(); -  Builder.setInsertPt(*MI.getParent(), MI); - -  int I = MI.getOperand(3).getShuffleMask()[0]; -  Register Src1 = MI.getOperand(1).getReg(); -  LLT Src1Ty = MRI.getType(Src1); -  int Src1NumElts = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1; -  Register SrcReg; -  if (I >= Src1NumElts) { -    SrcReg = MI.getOperand(2).getReg(); -    I -= Src1NumElts; -  } else if (I >= 0) -    SrcReg = Src1; - -  if (I < 0) -    Builder.buildUndef(DstReg); -  else if (!MRI.getType(SrcReg).isVector()) -    Builder.buildCopy(DstReg, SrcReg); -  else -    Builder.buildExtractVectorElementConstant(DstReg, SrcReg, I); - -  MI.eraseFromParent(); -} -  namespace {  /// Select a preference between two uses. CurrentUse is the current preference @@ -8369,7 +8321,7 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,      return false;    ArrayRef<int> Mask = Shuffle.getMask(); -  const unsigned NumSrcElems = Src1Ty.isVector() ? 
Src1Ty.getNumElements() : 1; +  const unsigned NumSrcElems = Src1Ty.getNumElements();    bool TouchesSrc1 = false;    bool TouchesSrc2 = false; diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp index 04d9309..d6f23b6 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp @@ -602,6 +602,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,                           Depth + 1);      computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts,                           Depth + 1); +    OffsetKnown = OffsetKnown.sext(BitWidth); +    WidthKnown = WidthKnown.sext(BitWidth);      Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown);      // Sign extend the extracted value using shift left and arithmetic shift      // right. diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index b49040b..1fc90d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3359,6 +3359,54 @@ bool IRTranslator::translateShuffleVector(const User &U,      Mask = SVI->getShuffleMask();    else      Mask = cast<ConstantExpr>(U).getShuffleMask(); + +  // As GISel does not represent <1 x > vectors as a separate type from scalars, +  // we transform shuffle_vector with a scalar output to an +  // ExtractVectorElement. If the input type is also scalar it becomes a Copy. 
+  unsigned DstElts = cast<FixedVectorType>(U.getType())->getNumElements(); +  unsigned SrcElts = +      cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements(); +  if (DstElts == 1) { +    unsigned M = Mask[0]; +    if (SrcElts == 1) { +      if (M == 0 || M == 1) +        return translateCopy(U, *U.getOperand(M), MIRBuilder); +      MIRBuilder.buildUndef(getOrCreateVReg(U)); +    } else { +      Register Dst = getOrCreateVReg(U); +      if (M < SrcElts) { +        MIRBuilder.buildExtractVectorElementConstant( +            Dst, getOrCreateVReg(*U.getOperand(0)), M); +      } else if (M < SrcElts * 2) { +        MIRBuilder.buildExtractVectorElementConstant( +            Dst, getOrCreateVReg(*U.getOperand(1)), M - SrcElts); +      } else { +        MIRBuilder.buildUndef(Dst); +      } +    } +    return true; +  } + +  // A single element src is transformed to a build_vector. +  if (SrcElts == 1) { +    SmallVector<Register> Ops; +    Register Undef; +    for (int M : Mask) { +      LLT SrcTy = getLLTForType(*U.getOperand(0)->getType(), *DL); +      if (M == 0 || M == 1) { +        Ops.push_back(getOrCreateVReg(*U.getOperand(M))); +      } else { +        if (!Undef.isValid()) { +          Undef = MRI->createGenericVirtualRegister(SrcTy); +          MIRBuilder.buildUndef(Undef); +        } +        Ops.push_back(Undef); +      } +    } +    MIRBuilder.buildBuildVector(getOrCreateVReg(U), Ops); +    return true; +  } +    ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask);    MIRBuilder        .buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)}, diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 38ec83f..178529f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4748,6 +4748,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {    case G_FMINIMUMNUM:    case G_FMAXIMUMNUM:      
return lowerFMinNumMaxNum(MI); +  case G_FMINIMUM: +  case G_FMAXIMUM: +    return lowerFMinimumMaximum(MI);    case G_MERGE_VALUES:      return lowerMergeValues(MI);    case G_UNMERGE_VALUES: @@ -5819,6 +5822,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(      } else if (InputUsed[0] == -1U) {        // No input vectors were used! The result is undefined.        Output = MIRBuilder.buildUndef(NarrowTy).getReg(0); +    } else if (NewElts == 1) { +      Output = MIRBuilder.buildCopy(NarrowTy, Inputs[InputUsed[0]]).getReg(0);      } else {        Register Op0 = Inputs[InputUsed[0]];        // If only one input was used, use an undefined vector for the other. @@ -8775,6 +8780,77 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {    return Legalized;  } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) { +  unsigned Opc = MI.getOpcode(); +  auto [Dst, Src0, Src1] = MI.getFirst3Regs(); +  LLT Ty = MRI.getType(Dst); +  LLT CmpTy = Ty.changeElementSize(1); + +  bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM); +  unsigned OpcIeee = +      IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE; +  unsigned OpcNonIeee = +      IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM; +  bool MinMaxMustRespectOrderedZero = false; +  Register Res; + +  // IEEE variants don't need canonicalization +  if (LI.isLegalOrCustom({OpcIeee, Ty})) { +    Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0); +    MinMaxMustRespectOrderedZero = true; +  } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) { +    Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0); +  } else { +    auto Compare = MIRBuilder.buildFCmp( +        IsMax ? 
CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1); +    Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0); +  } + +  // Propagate NaN if either operand may be NaN +  if (!MI.getFlag(MachineInstr::FmNoNans) && +      (!isKnownNeverNaN(Src0, MRI) || !isKnownNeverNaN(Src1, MRI))) { +    auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1); + +    LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType(); +    APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy)); +    Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0); +    if (Ty.isVector()) +      NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0); + +    Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0); +  } + +  // fminimum/fmaximum requires -0.0 less than +0.0 +  if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) { +    GISelValueTracking VT(MIRBuilder.getMF()); +    KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero); +    KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero); + +    if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) { +      const unsigned Flags = MI.getFlags(); +      Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0); +      auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero); + +      unsigned TestClass = IsMax ? 
fcPosZero : fcNegZero; + +      auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass); +      auto LHSSelect = +          MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res, Flags); + +      auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass); +      auto RHSSelect = +          MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect, Flags); + +      Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res, Flags).getReg(0); +    } +  } + +  MIRBuilder.buildCopy(Dst, Res); +  MI.eraseFromParent(); +  return Legalized; +} +  LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {    // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c    Register DstReg = MI.getOperand(0).getReg(); @@ -9016,22 +9092,18 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {        continue;      } -    if (Src0Ty.isScalar()) { -      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg); -    } else { -      int NumElts = Src0Ty.getNumElements(); -      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; -      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts; -      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); -      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); -      BuildVec.push_back(Extract.getReg(0)); -    } +    assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR"); + +    int NumElts = Src0Ty.getNumElements(); +    Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg; +    int ExtractIdx = Idx < NumElts ? 
Idx : Idx - NumElts; +    auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx); +    auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK); +    BuildVec.push_back(Extract.getReg(0));    } -  if (DstTy.isVector()) -    MIRBuilder.buildBuildVector(DstReg, BuildVec); -  else -    MIRBuilder.buildCopy(DstReg, BuildVec[0]); +  assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR"); +  MIRBuilder.buildBuildVector(DstReg, BuildVec);    MI.eraseFromParent();    return Legalized;  } diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 27df7e3..4b4df98 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -800,10 +800,11 @@ MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res,    LLT DstTy = Res.getLLTTy(*getMRI());    LLT Src1Ty = Src1.getLLTTy(*getMRI());    LLT Src2Ty = Src2.getLLTTy(*getMRI()); -  const LLT DstElemTy = DstTy.isVector() ? DstTy.getElementType() : DstTy; -  const LLT ElemTy1 = Src1Ty.isVector() ? Src1Ty.getElementType() : Src1Ty; -  const LLT ElemTy2 = Src2Ty.isVector() ? 
Src2Ty.getElementType() : Src2Ty; +  const LLT DstElemTy = DstTy.getScalarType(); +  const LLT ElemTy1 = Src1Ty.getScalarType(); +  const LLT ElemTy2 = Src2Ty.getScalarType();    assert(DstElemTy == ElemTy1 && DstElemTy == ElemTy2); +  assert(Mask.size() > 1 && "Scalar G_SHUFFLE_VECTOR are not supported");    (void)DstElemTy;    (void)ElemTy1;    (void)ElemTy2; diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 6a464d9..4795d81 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -2788,6 +2788,9 @@ bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) {    if (expectAndConsume(MIToken::rparen))      return error("shufflemask should be terminated by ')'."); +  if (ShufMask.size() < 2) +    return error("shufflemask should have > 1 element"); +    ArrayRef<int> MaskAlloc = MF.allocateShuffleMask(ShufMask);    Dest = MachineOperand::CreateShuffleMask(MaskAlloc);    return false; diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 1154855..c0710c4 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1924,13 +1924,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {      if (Src0Ty != Src1Ty)        report("Source operands must be the same type", MI); -    if (Src0Ty.getScalarType() != DstTy.getScalarType()) +    if (Src0Ty.getScalarType() != DstTy.getScalarType()) {        report("G_SHUFFLE_VECTOR cannot change element type", MI); +      break; +    } +    if (!Src0Ty.isVector()) { +      report("G_SHUFFLE_VECTOR must have vector src", MI); +      break; +    } +    if (!DstTy.isVector()) { +      report("G_SHUFFLE_VECTOR must have vector dst", MI); +      break; +    }      // Don't check that all operands are vector because scalars are used in      // place of 1 element vectors. -    int SrcNumElts = Src0Ty.isVector() ? 
Src0Ty.getNumElements() : 1; -    int DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1; +    int SrcNumElts = Src0Ty.getNumElements(); +    int DstNumElts = DstTy.getNumElements();      ArrayRef<int> MaskIdxes = MaskOp.getShuffleMask(); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 72b364c..697b779 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -211,7 +211,7 @@ private:      unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }    }; -  using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>; +  using LiveRegMap = SparseSet<LiveReg, unsigned, identity, uint16_t>;    /// This map contains entries for each virtual register that is currently    /// available in a physical register.    LiveRegMap LiveVirtRegs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d2ea652..8676060 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19993,8 +19993,12 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,    //    nor a successor of N. Otherwise, if Op is folded that would    //    create a cycle.    unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); -  for (SDNode *Op : Ptr->users()) { +  for (SDUse &U : Ptr->uses()) { +    if (U.getResNo() != Ptr.getResNo()) +      continue; +      // Check for #1. 
+    SDNode *Op = U.getUser();      if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))        continue; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 060b1dd..59798b3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2097,6 +2097,11 @@ Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {  }  Function *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const { +  // MSVC CRT has a function to validate security cookie. +  RTLIB::LibcallImpl SecurityCheckCookieLibcall = +      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); +  if (SecurityCheckCookieLibcall != RTLIB::Unsupported) +    return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));    return nullptr;  } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 488b078..1096e57 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -4082,10 +4082,10 @@ void AssemblyWriter::printTypeIdentities() {  /// printFunction - Print all aspects of a function.  
void AssemblyWriter::printFunction(const Function *F) { -  if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); -    if (F->isMaterializable())      Out << "; Materializable\n"; +  else if (AnnotationWriter) +    AnnotationWriter->emitFunctionAnnot(F, Out);    const AttributeList &Attrs = F->getAttributes();    if (Attrs.hasFnAttrs()) { diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index 1e1d0a6..70c4577 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -73,9 +73,10 @@ add_llvm_component_library(LLVMMC    ${LLVM_MAIN_INCLUDE_DIR}/llvm/MC    LINK_COMPONENTS +  BinaryFormat +  DebugInfoDWARFLowLevel    Support    TargetParser -  BinaryFormat    DEPENDS    intrinsics_gen diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index d6fa54c..e0a90df 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ b/llvm/lib/MC/MCSFrame.cpp @@ -8,6 +8,8 @@  #include "llvm/MC/MCSFrame.h"  #include "llvm/BinaryFormat/SFrame.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h" +#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h"  #include "llvm/MC/MCAsmInfo.h"  #include "llvm/MC/MCContext.h"  #include "llvm/MC/MCObjectFileInfo.h" @@ -211,8 +213,152 @@ class SFrameEmitterImpl {      return true;    } +  // Technically, the escape data could be anything, but it is commonly a dwarf +  // CFI program. Even then, it could contain an arbitrarily complicated Dwarf +  // expression. Following gnu-gas, look for certain common cases that could +  // invalidate an FDE, emit a warning for those sequences, and don't generate +  // an FDE in those cases. Allow any that are known safe. It is likely that +  // more thorough test cases could refine this code, but it handles the most +  // important ones compatibly with gas. +  // Returns true if the CFI escape sequence is safe for sframes. 
+  bool isCFIEscapeSafe(SFrameFDE &FDE, const SFrameFRE &FRE, +                       const MCCFIInstruction &CFI) { +    const MCAsmInfo *AI = Streamer.getContext().getAsmInfo(); +    DWARFDataExtractorSimple data(CFI.getValues(), AI->isLittleEndian(), +                                  AI->getCodePointerSize()); + +    // Normally, both alignment factors are extracted from the enclosing Dwarf +    // FDE or CIE. We don't have one here. Alignments are used for scaling +    // factors for ops like CFA_def_cfa_offset_sf. But this particular function +    // is only interested in registers. +    dwarf::CFIProgram P(/*CodeAlignmentFactor=*/1, +                        /*DataAlignmentFactor=*/1, +                        Streamer.getContext().getTargetTriple().getArch()); +    uint64_t Offset = 0; +    if (P.parse(data, &Offset, CFI.getValues().size())) { +      // Not a parsable dwarf expression. Assume the worst. +      Streamer.getContext().reportWarning( +          CFI.getLoc(), +          "skipping SFrame FDE; .cfi_escape with unknown effects"); +      return false; +    } + +    // This loop deals with dwarf::CFIProgram::Instructions. Everywhere else +    // this file deals with MCCFIInstructions. +    for (const dwarf::CFIProgram::Instruction &I : P) { +      switch (I.Opcode) { +      case dwarf::DW_CFA_nop: +        break; +      case dwarf::DW_CFA_val_offset: { +        // First argument is a register. Anything that touches CFA, FP, or RA is +        // a problem, but allow others through. As an even more special case, +        // allow SP + 0. +        auto Reg = I.getOperandAsUnsigned(P, 0); +        // The parser should have failed in this case. 
+        assert(Reg && "DW_CFA_val_offset with no register."); +        bool SPOk = true; +        if (*Reg == SPReg) { +          auto Opnd = I.getOperandAsSigned(P, 1); +          if (!Opnd || *Opnd != 0) +            SPOk = false; +        } +        if (!SPOk || *Reg == RAReg || *Reg == FPReg) { +          StringRef RN = *Reg == SPReg +                             ? "SP reg " +                             : (*Reg == FPReg ? "FP reg " : "RA reg "); +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine( +                  "skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with ") + +                  RN + Twine(*Reg)); +          return false; +        } +      } break; +      case dwarf::DW_CFA_expression: { +        // First argument is a register. Anything that touches CFA, FP, or RA is +        // a problem, but allow others through. +        auto Reg = I.getOperandAsUnsigned(P, 0); +        if (!Reg) { +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              "skipping SFrame FDE; .cfi_escape with unknown effects"); +          return false; +        } +        if (*Reg == SPReg || *Reg == RAReg || *Reg == FPReg) { +          StringRef RN = *Reg == SPReg +                             ? "SP reg " +                             : (*Reg == FPReg ? "FP reg " : "RA reg "); +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine( +                  "skipping SFrame FDE; .cfi_escape DW_CFA_expression with ") + +                  RN + Twine(*Reg)); +          return false; +        } +      } break; +      case dwarf::DW_CFA_GNU_args_size: { +        auto Size = I.getOperandAsSigned(P, 0); +        // Zero size doesn't affect the cfa. 
+        if (Size && *Size == 0) +          break; +        if (FRE.Info.getBaseRegister() != BaseReg::FP) { +          Streamer.getContext().reportWarning( +              CFI.getLoc(), +              Twine("skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size " +                    "with non frame-pointer CFA")); +          return false; +        } +      } break; +      // Cases that gas doesn't specially handle. TODO: Some of these could be +      // analyzed and handled instead of just punting. But these are uncommon, +      // or should be written as normal cfi directives. Some will need fixes to +      // the scaling factor. +      case dwarf::DW_CFA_advance_loc: +      case dwarf::DW_CFA_offset: +      case dwarf::DW_CFA_restore: +      case dwarf::DW_CFA_set_loc: +      case dwarf::DW_CFA_advance_loc1: +      case dwarf::DW_CFA_advance_loc2: +      case dwarf::DW_CFA_advance_loc4: +      case dwarf::DW_CFA_offset_extended: +      case dwarf::DW_CFA_restore_extended: +      case dwarf::DW_CFA_undefined: +      case dwarf::DW_CFA_same_value: +      case dwarf::DW_CFA_register: +      case dwarf::DW_CFA_remember_state: +      case dwarf::DW_CFA_restore_state: +      case dwarf::DW_CFA_def_cfa: +      case dwarf::DW_CFA_def_cfa_register: +      case dwarf::DW_CFA_def_cfa_offset: +      case dwarf::DW_CFA_def_cfa_expression: +      case dwarf::DW_CFA_offset_extended_sf: +      case dwarf::DW_CFA_def_cfa_sf: +      case dwarf::DW_CFA_def_cfa_offset_sf: +      case dwarf::DW_CFA_val_offset_sf: +      case dwarf::DW_CFA_val_expression: +      case dwarf::DW_CFA_MIPS_advance_loc8: +      case dwarf::DW_CFA_AARCH64_negate_ra_state_with_pc: +      case dwarf::DW_CFA_AARCH64_negate_ra_state: +      case dwarf::DW_CFA_LLVM_def_aspace_cfa: +      case dwarf::DW_CFA_LLVM_def_aspace_cfa_sf: +        Streamer.getContext().reportWarning( +            CFI.getLoc(), "skipping SFrame FDE; .cfi_escape " +                          "CFA expression with unknown side effects"); +   
     return false; +      default: +        // Dwarf expression was only partially valid, and user could have +        // written anything. +        Streamer.getContext().reportWarning( +            CFI.getLoc(), +            "skipping SFrame FDE; .cfi_escape with unknown effects"); +        return false; +      } +    } +    return true; +  } +    // Add the effects of CFI to the current FDE, creating a new FRE when -  // necessary. +  // necessary. Return true if the CFI is representable in the sframe format.    bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) {      switch (CFI.getOperation()) {      case MCCFIInstruction::OpDefCfaRegister: @@ -265,10 +411,11 @@ class SFrameEmitterImpl {        FRE = FDE.SaveState.pop_back_val();        return true;      case MCCFIInstruction::OpEscape: -      // TODO: Implement. Will use FDE. -      return true; +      // This is a string of bytes that contains an arbitrary dwarf-expression +      // that may or may not affect unwind info. +      return isCFIEscapeSafe(FDE, FRE, CFI);      default: -      // Instructions that don't affect the CFA, RA, and SP can be safely +      // Instructions that don't affect the CFA, RA, and FP can be safely        // ignored.        
return true;      } diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 67483ba..9d45096 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -240,7 +240,8 @@ private:    getGroupEntry(StringRef GroupName, StringRef GroupDescription) {      std::pair<TimerGroup *, Name2TimerMap> &GroupEntry = Map[GroupName];      if (!GroupEntry.first) -      GroupEntry.first = new TimerGroup(GroupName, GroupDescription); +      GroupEntry.first = +          new TimerGroup(GroupName, GroupDescription, /*PrintOnExit=*/true);      return GroupEntry;    } @@ -270,9 +271,10 @@ TimerGroup &NamedRegionTimer::getNamedTimerGroup(StringRef GroupName,  static TimerGroup *TimerGroupList = nullptr;  TimerGroup::TimerGroup(StringRef Name, StringRef Description, -                       sys::SmartMutex<true> &lock) +                       sys::SmartMutex<true> &lock, bool PrintOnExit)      : Name(Name.begin(), Name.end()), -      Description(Description.begin(), Description.end()) { +      Description(Description.begin(), Description.end()), +      PrintOnExit(PrintOnExit) {    // Add the group to TimerGroupList.    
sys::SmartScopedLock<true> L(lock);    if (TimerGroupList) @@ -282,12 +284,12 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description,    TimerGroupList = this;  } -TimerGroup::TimerGroup(StringRef Name, StringRef Description) -    : TimerGroup(Name, Description, timerLock()) {} +TimerGroup::TimerGroup(StringRef Name, StringRef Description, bool PrintOnExit) +    : TimerGroup(Name, Description, timerLock(), PrintOnExit) {}  TimerGroup::TimerGroup(StringRef Name, StringRef Description, -                       const StringMap<TimeRecord> &Records) -    : TimerGroup(Name, Description) { +                       const StringMap<TimeRecord> &Records, bool PrintOnExit) +    : TimerGroup(Name, Description, PrintOnExit) {    TimersToPrint.reserve(Records.size());    for (const auto &P : Records)      TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()), @@ -301,7 +303,7 @@ TimerGroup::~TimerGroup() {    while (FirstTimer)      removeTimer(*FirstTimer); -  if (!TimersToPrint.empty()) { +  if (!TimersToPrint.empty() && PrintOnExit) {      std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();      PrintQueuedTimers(*OutStream);    } @@ -530,7 +532,7 @@ public:    sys::SmartMutex<true> TimerLock;    TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers", -                               TimerLock}; +                               TimerLock, /*PrintOnExit=*/true};    SignpostEmitter Signposts;    // Order of these members and initialization below is important. 
For example diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ecaeff7..b3ec65c 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -71,7 +71,6 @@ def AArch64PreLegalizerCombiner: GICombiner<    "AArch64PreLegalizerCombinerImpl", [all_combines,                                        icmp_redundant_trunc,                                        fold_global_offset, -                                      shuffle_to_extract,                                        ext_addv_to_udot_addv,                                        ext_uaddv_to_uaddlv,                                        push_sub_through_zext, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c..d16b116 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9002,12 +9002,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,  }  static SMECallAttrs -getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, +getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,                  const TargetLowering::CallLoweringInfo &CLI) {    if (CLI.CB) -    return SMECallAttrs(*CLI.CB, &TLI); +    return SMECallAttrs(*CLI.CB, &RTLCI);    if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) -    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); +    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));    return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));  } @@ -9029,7 +9029,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(    // SME Streaming functions are not eligible for TCO as they may require    // the streaming mode or ZA to be restored after returning from the call. 
-  SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); +  SMECallAttrs CallAttrs = +      getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);    if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||        CallAttrs.requiresPreservingAllZAState() ||        CallAttrs.caller().hasStreamingBody()) @@ -9454,7 +9455,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,    }    // Determine whether we need any streaming mode changes. -  SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); +  SMECallAttrs CallAttrs = +      getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);    std::optional<unsigned> ZAMarkerNode;    bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); @@ -19476,6 +19478,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {                       Op1 ? Op1 : Mul->getOperand(1));  } +// Multiplying an RDSVL value by a constant can sometimes be done cheaper by +// folding a power-of-two factor of the constant into the RDSVL immediate and +// compensating with an extra shift. +// +// We rewrite: +//   (mul (srl (rdsvl 1), w), x) +// to one of: +//   (shl (rdsvl y),  z)   if z > 0 +//   (srl (rdsvl y), abs(z))   if z < 0 +// where integers y, z satisfy   x = y * 2^(w + z)   and   y ∈ [-32, 31]. 
+static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) { +  SDLoc DL(Mul); +  EVT VT = Mul->getValueType(0); +  SDValue MulOp0 = Mul->getOperand(0); +  int ConstMultiplier = +      cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue(); +  if ((MulOp0->getOpcode() != ISD::SRL) || +      (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL)) +    return SDValue(); + +  unsigned AbsConstValue = abs(ConstMultiplier); +  unsigned OperandShift = +      cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue(); + +  // z ≤ ctz(|x|) - w  (largest extra shift we can take while keeping y +  // integral) +  int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift; + +  // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need: +  // 2^(w + z) ≥ ceil(x / B)  ⇒  z ≥ ceil_log2(ceil(x / B)) - w  (LowerBound). +  unsigned B = ConstMultiplier < 0 ? 32 : 31; +  unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B) +  int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift; + +  // No valid solution found. +  if (LowerBound > UpperBound) +    return SDValue(); + +  // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra +  // shift if possible. +  int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound); + +  // y = x / 2^(w + z) +  int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) * +                     (ConstMultiplier < 0 ? -1 : 1); +  auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, +                           DAG.getSignedConstant(RdsvlMul, DL, MVT::i32)); + +  if (Shift == 0) +    return Rdsvl; +  return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl, +                     DAG.getConstant(abs(Shift), DL, MVT::i32), +                     SDNodeFlags::Exact); +} +  // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz  // Same for other types with equivalent constants.  
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { @@ -19604,6 +19661,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,    if (!isa<ConstantSDNode>(N1))      return SDValue(); +  if (SDValue Ext = performMulRdsvlCombine(N, DAG)) +    return Ext; +    ConstantSDNode *C = cast<ConstantSDNode>(N1);    const APInt &ConstValue = C->getAPIntValue(); @@ -26665,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N,    }    if (N->getOpcode() == AArch64ISD::DUP) { +    SDValue Op = N->getOperand(0); + +    // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer. +    // For example: +    //   v4i32 = DUP (i32 (zextloadi8 addr)) +    // => +    //   v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0 +    //   v4i32 = DUPLANE32 (v4i32), 0 +    if (auto *LD = dyn_cast<LoadSDNode>(Op)) { +      ISD::LoadExtType ExtType = LD->getExtensionType(); +      EVT MemVT = LD->getMemoryVT(); +      EVT ElemVT = VT.getVectorElementType(); +      if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) && +          (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) && +          ElemVT != MemVT && LD->hasOneUse()) { +        EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT, +                                        128 / ElemVT.getSizeInBits()); +        SDValue ScalarToVec = +            DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op); +        return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec, +                               DCI.DAG.getConstant(0, DL, MVT::i64)); +      } +    } +      // If the instruction is known to produce a scalar in SIMD registers, we can      // duplicate it across the vector lanes using DUPLANE instead of moving it      // to a GPR first. 
For example, this allows us to handle:      //   v4i32 = DUP (i32 (FCMGT (f32, f32))) -    SDValue Op = N->getOperand(0);      // FIXME: Ideally, we should be able to handle all instructions that      // produce a scalar value in FPRs.      if (Op.getOpcode() == AArch64ISD::FCMEQ || @@ -29430,15 +29513,6 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  RTLIB::LibcallImpl SecurityCheckCookieLibcall = -      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); -  if (SecurityCheckCookieLibcall != RTLIB::Unsupported) -    return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall)); -  return TargetLowering::getSSPStackGuardCheck(M); -} -  Value *  AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    // Android provides a fixed TLS slot for the SafeStack pointer. See the @@ -29447,11 +29521,6 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    if (Subtarget->isTargetAndroid())      return UseTlsOffset(IRB, 0x48); -  // Fuchsia is similar. -  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. 
-  if (Subtarget->isTargetFuchsia()) -    return UseTlsOffset(IRB, -0x8); -    return TargetLowering::getSafeStackPointerLocation(IRB);  } @@ -29769,7 +29838,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {    // Checks to allow the use of SME instructions    if (auto *Base = dyn_cast<CallBase>(&Inst)) { -    auto CallAttrs = SMECallAttrs(*Base, this); +    auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());      if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||          CallAttrs.requiresPreservingZT0() ||          CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9f..2cb8ed2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -366,7 +366,6 @@ public:    Value *getIRStackGuard(IRBuilderBase &IRB) const override;    void insertSSPDeclarations(Module &M) const override; -  Function *getSSPStackGuardCheck(const Module &M) const override;    /// If the target has a standard location for the unsafe stack pointer,    /// returns the address of that location. Otherwise, returns nullptr. 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index eab1627..58a53af 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5298,7 +5298,7 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,  }  multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm,  -                                 SDPatternOperator OpN = null_frag> { +                                 SDPatternOperator OpN> {    // double-precision to 32-bit SIMD/FPR    def SDr :  BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm,               [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b74ca79..b9e299e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4022,22 +4022,6 @@ defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",  def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),        (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; -// load zero-extended i32, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -// load zero-extended i16, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f64 -def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), -          (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -// load zero-extended i16, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, 
uimm12s2:$offset))))), -          (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -// load zero-extended i8, bitcast to f32 -def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), -          (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -  // Pre-fetch.  def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",                          [(AArch64Prefetch timm:$Rt, @@ -4389,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64                 (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),             (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>; +// Patterns for bitconvert or scalar_to_vector of load operations. +// Enables direct SIMD register loads for small integer types (i8/i16) that are +// naturally zero-extended to i32/i64. +multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy, +                                SDPatternOperator OuterOp, +                                PatFrags LoadOp8, PatFrags LoadOp16> { +  // 8-bit loads. +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>; + +  // 16-bit loads. 
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>; +} + +// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit. +multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy, +                                   SDPatternOperator OuterOp, +                                   PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> { +  defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>; + +  // 32-bit loads. 
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), +            (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>; +  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))), +            (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>; +} + +// Instantiate bitconvert patterns for floating-point types. +defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>; +defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>; + +// Instantiate scalar_to_vector patterns for all vector types. +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>; +defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>; +defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>; +  // Pre-fetch.  
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",                    [(AArch64Prefetch timm:$Rt, @@ -5253,113 +5295,10 @@ let Predicates = [HasNEON, HasFPRCVT] in{    defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>;    defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>;    defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>; -  defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">; -  defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">; -} - - -// AArch64's FCVT instructions saturate when out of range. -multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat f16:$Rn, i32)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat f16:$Rn, i64)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat f32:$Rn, i32)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat f32:$Rn, i64)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat f64:$Rn, i32)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat f64:$Rn, i64)), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), -            
(!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; -  } -  def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; -  def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), -            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), -            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; - -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), -            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), -            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; -  } -  def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), -            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), -            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; -  def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), -            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; -  def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), -            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; -} - -defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; -defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; - -multiclass 
FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> { -  def : Pat<(i32 (to_int (round f32:$Rn))), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int (round f32:$Rn))), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int (round f64:$Rn))), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int (round f64:$Rn))), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; - -  // These instructions saturate like fp_to_[su]int_sat. -  let Predicates = [HasFullFP16] in { -  def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), -            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), -            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; -  } -  def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), -            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), -            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; -  def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), -            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; -  def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), -            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +  defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs", any_fp_to_sint>; +  defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>;  } -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil,  "FCVTPS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil,  "FCVTPU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">; -defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">; -defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">; -  let 
Predicates = [HasFullFP16] in { @@ -6567,8 +6506,8 @@ defm FCVTNU : SIMDFPTwoScalar<   1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtn  defm FCVTPS : SIMDFPTwoScalar<   0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>;  defm FCVTPU : SIMDFPTwoScalar<   1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>;  def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu">; +defm FCVTZS : SIMDFPTwoScalar<   0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : SIMDFPTwoScalar<   1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;  defm FRECPE : SIMDFPTwoScalar<   0, 1, 0b11101, "frecpe">;  defm FRECPX : SIMDFPTwoScalar<   0, 1, 0b11111, "frecpx">;  defm FRSQRTE : SIMDFPTwoScalar<  1, 1, 0b11101, "frsqrte">; @@ -6588,6 +6527,7 @@ defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",  // Floating-point conversion patterns.  multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> { +  let Predicates = [HasFPRCVT] in {    def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))),              (!cast<Instruction>(INST # SDr) FPR64:$Rn)>;    def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))), @@ -6596,6 +6536,7 @@ multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {              (!cast<Instruction>(INST # DHr) FPR16:$Rn)>;    def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),              (!cast<Instruction>(INST # DSr) FPR32:$Rn)>; +  }    def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),              (!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;    def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))), @@ -6610,6 +6551,8 @@ defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">;  defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">;  defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">;  defm: 
FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_sint, "FCVTZS">; +defm: FPToIntegerSIMDScalarPatterns<any_fp_to_uint, "FCVTZU">;  multiclass FPToIntegerIntPats<Intrinsic round, string INST> {    let Predicates = [HasFullFP16] in { @@ -6666,6 +6609,196 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {  defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;  defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; +// AArch64's FCVT instructions saturate when out of range. +multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat f16:$Rn, i32)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat f16:$Rn, i64)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat f32:$Rn, i32)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat f32:$Rn, i64)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat f64:$Rn, i32)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat f64:$Rn, i64)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can 
use register classes to determine +  // which FCVT instruction to use. +  let Predicates = [HasFPRCVT] in { +  def : Pat<(i32 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # SHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f16:$Rn)), +            (!cast<Instruction>(INST # DHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi f32:$Rn)), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi f64:$Rn)), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFPRCVT] in { +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f16:$Rn, i32)))), +            (!cast<Instruction>(INST # SHr) f16:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f16:$Rn, i64)))), +            (!cast<Instruction>(INST # DHr) f16:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f32:$Rn, i64)))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f64:$Rn, i32)))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat f64:$Rn, i64)))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; +  } +  def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul 
f32:$Rn, fixedpoint_f32_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; +  def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), +            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), +            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; + +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), +            (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), +            (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; +  } +  def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), +            (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), +            (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; +  def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), +            (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; +  def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), +            (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; +} + +defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; +defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; + +multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode to_int_sat_gi, SDNode round, string INST> { +  def : Pat<(i32 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can use register 
classes to determine +  // which FCVT instruction to use. +  let Predicates = [HasFPRCVT] in { +  def : Pat<(i64 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(i32 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int (round f32:$Rn))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int (round f64:$Rn))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  let Predicates = [HasFPRCVT] in { +  def : Pat<(f64 (bitconvert (i64 (to_int (round f32:$Rn))))), +            (!cast<Instruction>(INST # DSr) f32:$Rn)>; +  def : Pat<(f32 (bitconvert (i32 (to_int (round f64:$Rn))))), +            (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int (round f32:$Rn))))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int (round f64:$Rn))))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; + +  // These instructions saturate like fp_to_[su]int_sat. +  let Predicates = [HasFullFP16] in { +  def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), +            (!cast<Instruction>(INST # UWHr) f16:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), +            (!cast<Instruction>(INST # UXHr) f16:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), +            (!cast<Instruction>(INST # UWSr) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), +            (!cast<Instruction>(INST # UXSr) f32:$Rn)>; +  def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), +            (!cast<Instruction>(INST # UWDr) f64:$Rn)>; +  def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), +            (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + +  // For global-isel we can use register classes to determine +  // which FCVT instruction to use. 
+  let Predicates = [HasFPRCVT] in { +    def : Pat<(i32 (to_int_sat_gi (round f16:$Rn))), +              (!cast<Instruction>(INST # SHr) f16:$Rn)>; +    def : Pat<(i64 (to_int_sat_gi (round f16:$Rn))), +              (!cast<Instruction>(INST # DHr) f16:$Rn)>; +    def : Pat<(i64 (to_int_sat_gi (round f32:$Rn))), +              (!cast<Instruction>(INST # DSr) f32:$Rn)>; +    def : Pat<(i32 (to_int_sat_gi (round f64:$Rn))), +              (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(i32 (to_int_sat_gi (round f32:$Rn))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(i64 (to_int_sat_gi (round f64:$Rn))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; +             +  let Predicates = [HasFPRCVT] in { +    def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f16:$Rn), i32)))), +              (!cast<Instruction>(INST # SHr) f16:$Rn)>; +    def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f16:$Rn), i64)))), +              (!cast<Instruction>(INST # DHr) f16:$Rn)>; +    def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f32:$Rn), i64)))), +              (!cast<Instruction>(INST # DSr) f32:$Rn)>; +    def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f64:$Rn), i32)))), +              (!cast<Instruction>(INST # SDr) f64:$Rn)>; +  } +  def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f32:$Rn), i32)))), +            (!cast<Instruction>(INST # v1i32) f32:$Rn)>; +  def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f64:$Rn), i64)))), +            (!cast<Instruction>(INST # v1i64) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fceil,  "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fceil,  "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, 
ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fround, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fround, "FCVTAU">; +  // f16 -> s16 conversions  let Predicates = [HasFullFP16] in {    def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index bdde8e3..2387f17 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2762,11 +2762,11 @@ def : InstRW<[V2Write_11c_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>;  def : InstRW<[V2Write_11c_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>;  // Non temporal store, scalar + imm -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;  // Non temporal store, scalar + scalar -def : InstRW<[V2Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>; -def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>; +def : InstRW<[V2Write_2c_1L01_1S_1V01], (instrs STNT1H_ZRR)>; +def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BWD]_ZRR$")>;  // Scatter non temporal store, vector + scalar 32-bit element size  def : InstRW<[V2Write_4c_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 2053fc4..fede586 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode(  static bool isSMEABIRoutineCall(const CallInst &CI,                                  const AArch64TargetLowering &TLI) {    const auto *F = CI.getCalledFunction(); -  
return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); +  return F && +         SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();  }  /// Returns true if the function has explicit operations that can only be @@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,    // change only once and avoid inlining of G into F.    SMEAttrs FAttrs(*F); -  SMECallAttrs CallAttrs(Call, getTLI()); +  SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());    if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {      if (F == Call.getCaller()) // (1) @@ -957,23 +958,50 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,      return TyL.first + ExtraCost;    }    case Intrinsic::get_active_lane_mask: { -    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType()); -    if (RetTy) { -      EVT RetVT = getTLI()->getValueType(DL, RetTy); -      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); -      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) && -          !getTLI()->isTypeLegal(RetVT)) { -        // We don't have enough context at this point to determine if the mask -        // is going to be kept live after the block, which will force the vXi1 -        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. -        // For now, we just assume the vectorizer created this intrinsic and -        // the result will be the input for a PHI. In this case the cost will -        // be extremely high for fixed-width vectors. -        // NOTE: getScalarizationOverhead returns a cost that's far too -        // pessimistic for the actual generated codegen. In reality there are -        // two instructions generated per lane. 
-        return RetTy->getNumElements() * 2; +    auto RetTy = cast<VectorType>(ICA.getReturnType()); +    EVT RetVT = getTLI()->getValueType(DL, RetTy); +    EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); +    if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT)) +      break; + +    if (RetTy->isScalableTy()) { +      if (TLI->getTypeAction(RetTy->getContext(), RetVT) != +          TargetLowering::TypeSplitVector) +        break; + +      auto LT = getTypeLegalizationCost(RetTy); +      InstructionCost Cost = LT.first; +      // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost +      // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g. +      //   nxv32i1 = get_active_lane_mask(base, idx) -> +      //    {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx) +      if (ST->hasSVE2p1() || ST->hasSME2()) { +        Cost /= 2; +        if (Cost == 1) +          return Cost;        } + +      // If more than one whilelo intrinsic is required, include the extra cost +      // required by the saturating add & select required to increment the +      // start value after the first intrinsic call. +      Type *OpTy = ICA.getArgTypes()[0]; +      IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy}); +      InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind); +      Type *CondTy = OpTy->getWithNewBitWidth(1); +      SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy, +                                      CmpInst::ICMP_UGT, CostKind); +      return Cost + (SplitCost * (Cost - 1)); +    } else if (!getTLI()->isTypeLegal(RetVT)) { +      // We don't have enough context at this point to determine if the mask +      // is going to be kept live after the block, which will force the vXi1 +      // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. 
+      // For now, we just assume the vectorizer created this intrinsic and +      // the result will be the input for a PHI. In this case the cost will +      // be extremely high for fixed-width vectors. +      // NOTE: getScalarizationOverhead returns a cost that's far too +      // pessimistic for the actual generated codegen. In reality there are +      // two instructions generated per lane. +      return cast<FixedVectorType>(RetTy)->getNumElements() * 2;      }      break;    } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3e55b76..14b0f9a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5126,23 +5126,13 @@ bool AArch64InstructionSelector::selectShuffleVector(      MachineInstr &I, MachineRegisterInfo &MRI) {    const LLT DstTy = MRI.getType(I.getOperand(0).getReg());    Register Src1Reg = I.getOperand(1).getReg(); -  const LLT Src1Ty = MRI.getType(Src1Reg);    Register Src2Reg = I.getOperand(2).getReg(); -  const LLT Src2Ty = MRI.getType(Src2Reg);    ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();    MachineBasicBlock &MBB = *I.getParent();    MachineFunction &MF = *MBB.getParent();    LLVMContext &Ctx = MF.getFunction().getContext(); -  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if -  // it's originated from a <1 x T> type. Those should have been lowered into -  // G_BUILD_VECTOR earlier. 
-  if (!Src1Ty.isVector() || !Src2Ty.isVector()) { -    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n"); -    return false; -  } -    unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;    SmallVector<Constant *, 64> CstIdxs; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 05a4313..5f93847 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1201,25 +1201,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)          return llvm::is_contained(              {v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy);        }) -      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar -      // destinations, we just want those lowered into G_BUILD_VECTOR or -      // G_EXTRACT_ELEMENT. -      .lowerIf([=](const LegalityQuery &Query) { -        return !Query.Types[0].isVector() || !Query.Types[1].isVector(); -      })        .moreElementsIf(            [](const LegalityQuery &Query) { -            return Query.Types[0].isVector() && Query.Types[1].isVector() && -                   Query.Types[0].getNumElements() > -                       Query.Types[1].getNumElements(); +            return Query.Types[0].getNumElements() > +                   Query.Types[1].getNumElements();            },            changeTo(1, 0))        .moreElementsToNextPow2(0)        .moreElementsIf(            [](const LegalityQuery &Query) { -            return Query.Types[0].isVector() && Query.Types[1].isVector() && -                   Query.Types[0].getNumElements() < -                       Query.Types[1].getNumElements(); +            return Query.Types[0].getNumElements() < +                   Query.Types[1].getNumElements();            },            changeTo(0, 1))        .widenScalarOrEltToNextPow2OrMinSize(0, 8) diff --git 
a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 830a35bb..6d2d705 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -856,7 +856,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {      break;    }    case TargetOpcode::G_FPTOSI_SAT: -  case TargetOpcode::G_FPTOUI_SAT: { +  case TargetOpcode::G_FPTOUI_SAT: +  case TargetOpcode::G_FPTOSI: +  case TargetOpcode::G_FPTOUI: {      LLT DstType = MRI.getType(MI.getOperand(0).getReg());      if (DstType.isVector())        break; @@ -864,11 +866,19 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {        OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};        break;      } -    OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; +    TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); +    TypeSize SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, TRI); +    if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) && +        all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), +               [&](const MachineInstr &UseMI) { +                 return onlyUsesFP(UseMI, MRI, TRI) || +                        prefersFPUse(UseMI, MRI, TRI); +               })) +      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; +    else +      OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};      break;    } -  case TargetOpcode::G_FPTOSI: -  case TargetOpcode::G_FPTOUI:    case TargetOpcode::G_INTRINSIC_LRINT:    case TargetOpcode::G_INTRINSIC_LLRINT:      if (MRI.getType(MI.getOperand(0).getReg()).isVector()) diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index d71f728..085c8588 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -75,8 +75,8 @@ 
SMEAttrs::SMEAttrs(const AttributeList &Attrs) {  }  void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, -                                     const AArch64TargetLowering &TLI) { -  RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); +                                     const RTLIB::RuntimeLibcallsInfo &RTLCI) { +  RTLIB::LibcallImpl Impl = RTLCI.getSupportedLibcallImpl(FuncName);    if (Impl == RTLIB::Unsupported)      return;    unsigned KnownAttrs = SMEAttrs::Normal; @@ -124,21 +124,22 @@ bool SMECallAttrs::requiresSMChange() const {    return true;  } -SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) +SMECallAttrs::SMECallAttrs(const CallBase &CB, +                           const RTLIB::RuntimeLibcallsInfo *RTLCI)      : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),        Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {    if (auto *CalledFunction = CB.getCalledFunction()) -    CalledFn = SMEAttrs(*CalledFunction, TLI); - -  // An `invoke` of an agnostic ZA function may not return normally (it may -  // resume in an exception block). In this case, it acts like a private ZA -  // callee and may require a ZA save to be set up before it is called. -  if (isa<InvokeInst>(CB)) -    CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false); +    CalledFn = SMEAttrs(*CalledFunction, RTLCI);    // FIXME: We probably should not allow SME attributes on direct calls but    // clang duplicates streaming mode attributes at each callsite.    assert((IsIndirect ||            ((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) &&           "SME attributes at callsite do not match declaration"); + +  // An `invoke` of an agnostic ZA function may not return normally (it may +  // resume in an exception block). In this case, it acts like a private ZA +  // callee and may require a ZA save to be set up before it is called. 
+  if (isa<InvokeInst>(CB)) +    CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);  } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index d26e3cd..28c397e 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -12,8 +12,9 @@  #include "llvm/IR/Function.h"  namespace llvm { - -class AArch64TargetLowering; +namespace RTLIB { +struct RuntimeLibcallsInfo; +}  class Function;  class CallBase; @@ -52,14 +53,14 @@ public:    SMEAttrs() = default;    SMEAttrs(unsigned Mask) { set(Mask); } -  SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) +  SMEAttrs(const Function &F, const RTLIB::RuntimeLibcallsInfo *RTLCI = nullptr)        : SMEAttrs(F.getAttributes()) { -    if (TLI) -      addKnownFunctionAttrs(F.getName(), *TLI); +    if (RTLCI) +      addKnownFunctionAttrs(F.getName(), *RTLCI);    }    SMEAttrs(const AttributeList &L); -  SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { -    addKnownFunctionAttrs(FuncName, TLI); +  SMEAttrs(StringRef FuncName, const RTLIB::RuntimeLibcallsInfo &RTLCI) { +    addKnownFunctionAttrs(FuncName, RTLCI);    };    void set(unsigned M, bool Enable = true) { @@ -157,7 +158,7 @@ public:  private:    void addKnownFunctionAttrs(StringRef FuncName, -                             const AArch64TargetLowering &TLI); +                             const RTLIB::RuntimeLibcallsInfo &RTLCI);    void validate() const;  }; @@ -175,7 +176,7 @@ public:                 SMEAttrs Callsite = SMEAttrs::Normal)        : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} -  SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); +  SMECallAttrs(const CallBase &CB, const RTLIB::RuntimeLibcallsInfo *RTLCI);    SMEAttrs &caller() { return CallerFn; }    SMEAttrs &callee() { return IsIndirect ? 
Callsite : CalledFn; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[    combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask  ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), +  [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), +   (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< +  (defs root:$dst), +  (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), +  (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), +         (G_OR $or, $x_lo, $y), +         (G_MERGE_VALUES $dst, $or, $x_hi))>; +  let Predicates = [Has16BitInsts, NotHasMed3_16] in {  // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This  // saves one instruction compared to the promotion. 
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<    "AMDGPUPreLegalizerCombinerImpl",    [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,     foldable_fneg, combine_shuffle_vector_to_build_vector, -   binop_s64_with_s32_mask_combines]> { +   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {    let CombineAllMethodName = "tryCombineAllImpl";  } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<    [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,     uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,     rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, -   binop_s64_with_s32_mask_combines]> { +   binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {    let CombineAllMethodName = "tryCombineAllImpl";  } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..1a13b22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,      FPOpActions.clampMaxNumElementsStrict(0, S32, 2);    } +  auto &MinNumMaxNumIeee = +      getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + +  if (ST.hasVOP3PInsts()) { +    MinNumMaxNumIeee.legalFor(FPTypesPK16) +        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) +        .clampMaxNumElements(0, S16, 2) +        .clampScalar(0, S16, S64) +        .scalarize(0); +  } else if (ST.has16BitInsts()) { +    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); +  } else { +    MinNumMaxNumIeee.legalFor(FPTypesBase) +        .clampScalar(0, S32, S64) +        .scalarize(0); +  } +    auto &MinNumMaxNum = getActionDefinitionsBuilder( -      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, -       G_FMAXNUM_IEEE}); +      {G_FMINNUM, 
G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});    if (ST.hasVOP3PInsts()) {      MinNumMaxNum.customFor(FPTypesPK16) @@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,          .legalFor(FPTypesPK16)          .clampMaxNumElements(0, S16, 2)          .scalarize(0); +  } else if (ST.hasVOP3PInsts()) { +    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) +        .lowerFor({V2S16}) +        .clampMaxNumElementsStrict(0, S16, 2) +        .scalarize(0) +        .lower();    } else { -    // TODO: Implement -    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); +    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) +        .scalarize(0) +        .clampScalar(0, S32, S64) +        .lower();    }    getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(    case TargetOpcode::G_FMAXNUM:    case TargetOpcode::G_FMINIMUMNUM:    case TargetOpcode::G_FMAXIMUMNUM: -  case TargetOpcode::G_FMINNUM_IEEE: -  case TargetOpcode::G_FMAXNUM_IEEE:      return legalizeMinNumMaxNum(Helper, MI);    case TargetOpcode::G_EXTRACT_VECTOR_ELT:      return legalizeExtractVectorElt(MI, MRI, B); @@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,    MachineFunction &MF = Helper.MIRBuilder.getMF();    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); -  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || -                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - -  // With ieee_mode disabled, the instructions have the correct behavior -  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. -  // -  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode -  // enabled. 
-  if (!MFI->getMode().IEEE) { -    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || -        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) -      return true; - -    return !IsIEEEOp; -  } - -  if (IsIEEEOp) +  // With ieee_mode disabled, the instructions have the correct behavior. +  if (!MFI->getMode().IEEE)      return true;    return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8122db2..313ae3d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21381,15 +21381,6 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  RTLIB::LibcallImpl SecurityCheckCookie = -      getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE); -  if (SecurityCheckCookie != RTLIB::Unsupported) -    return M.getFunction(getLibcallImplName(SecurityCheckCookie)); -  return TargetLowering::getSSPStackGuardCheck(M); -} -  bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,                                                    unsigned &Cost) const {    // If we do not have NEON, vector types are not natively supported. 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8c5e0cf..357d2c5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -708,7 +708,6 @@ class VectorType;      bool useLoadStackGuardNode(const Module &M) const override;      void insertSSPDeclarations(Module &M) const override; -    Function *getSSPStackGuardCheck(const Module &M) const override;      bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,                                     unsigned &Cost) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 53be167..10d4cd5 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6546,23 +6546,25 @@ def KCFI_CHECK_ARM      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsARM]> { -  let Size = 28; // 7 instructions (bic, ldr, 4x eor, beq, udf) +  let Size = 40; // worst-case 10 instructions @ 4 bytes each +                 // (push, bic, ldr, 4x eor, pop, beq, udf)  }  def KCFI_CHECK_Thumb2      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsThumb2]> { -  let Size = -      32; // worst-case 9 instructions (push, bic, ldr, 4x eor, pop, beq.w, udf) +  let Size = 34; // worst-case (push.w[2], bic[4], ldr[4], 4x eor[16], pop.w[2], +                 // beq.w[4], udf[2])  }  def KCFI_CHECK_Thumb1      : PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,        Sched<[]>,        Requires<[IsThumb1Only]> { -  let Size = 50; // worst-case 25 instructions (pushes, bic helper, type -                 // building, cmp, pops) +  let Size = 38; // worst-case 19 instructions @ 2 bytes each +                 // (2x push, 3x bic-helper, subs+ldr, 13x type-building, cmp, +                 // 2x pop, beq, bkpt)  }  
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 42e90f0..d6fa65f 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -6,7 +6,7 @@  //  //===----------------------------------------------------------------------===//  /// -/// \file This file contains pases and utilities to convert a modern LLVM +/// \file This file contains passes and utilities to convert a modern LLVM  /// module into a module compatible with the LLVM 3.7-based DirectX Intermediate  /// Language (DXIL).  //===----------------------------------------------------------------------===// @@ -16,7 +16,6 @@  #include "DirectX.h"  #include "DirectXIRPasses/PointerTypeAnalysis.h"  #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/StringSet.h"  #include "llvm/Analysis/DXILMetadataAnalysis.h"  #include "llvm/Analysis/DXILResource.h" @@ -27,7 +26,6 @@  #include "llvm/IR/Module.h"  #include "llvm/InitializePasses.h"  #include "llvm/Pass.h" -#include "llvm/Support/Compiler.h"  #include "llvm/Support/VersionTuple.h"  #define DEBUG_TYPE "dxil-prepare" @@ -116,31 +114,6 @@ static void removeStringFunctionAttributes(Function &F,    F.removeRetAttrs(DeadAttrs);  } -static void cleanModuleFlags(Module &M) { -  NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); -  if (!MDFlags) -    return; - -  SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; -  M.getModuleFlagsMetadata(FlagEntries); -  bool Updated = false; -  for (auto &Flag : FlagEntries) { -    // llvm 3.7 only supports behavior up to AppendUnique. 
-    if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) -      continue; -    Flag.Behavior = Module::ModFlagBehavior::Warning; -    Updated = true; -  } - -  if (!Updated) -    return; - -  MDFlags->eraseFromParent(); - -  for (auto &Flag : FlagEntries) -    M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); -} -  class DXILPrepareModule : public ModulePass {    static Value *maybeGenerateBitcast(IRBuilder<> &Builder, @@ -202,15 +175,6 @@ class DXILPrepareModule : public ModulePass {                           Builder.getPtrTy(PtrTy->getAddressSpace())));    } -  static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { -    return {M.getMDKindID("dx.nonuniform"), -            M.getMDKindID("dx.controlflow.hints"), -            M.getMDKindID("dx.precise"), -            llvm::LLVMContext::MD_range, -            llvm::LLVMContext::MD_alias_scope, -            llvm::LLVMContext::MD_noalias}; -  } -  public:    bool runOnModule(Module &M) override {      PointerTypeMap PointerTypes = PointerTypeAnalysis::run(M); @@ -224,10 +188,7 @@ public:      const dxil::ModuleMetadataInfo MetadataInfo =          getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();      VersionTuple ValVer = MetadataInfo.ValidatorVersion; -    bool SkipValidation = ValVer.getMajor() == 0 && ValVer.getMinor() == 0; - -    // construct allowlist of valid metadata node kinds -    std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); +    bool AllowExperimental = ValVer.getMajor() == 0 && ValVer.getMinor() == 0;      for (auto &F : M.functions()) {        F.removeFnAttrs(AttrMask); @@ -235,7 +196,7 @@ public:        // Only remove string attributes if we are not skipping validation.        // This will reserve the experimental attributes when validation version        // is 0.0 for experiment mode. 
-      removeStringFunctionAttributes(F, SkipValidation); +      removeStringFunctionAttributes(F, AllowExperimental);        for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)          F.removeParamAttrs(Idx, AttrMask); @@ -243,11 +204,17 @@ public:          IRBuilder<> Builder(&BB);          for (auto &I : make_early_inc_range(BB)) { -          I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); +          if (auto *CB = dyn_cast<CallBase>(&I)) { +            CB->removeFnAttrs(AttrMask); +            CB->removeRetAttrs(AttrMask); +            for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) +              CB->removeParamAttrs(Idx, AttrMask); +            continue; +          }            // Emtting NoOp bitcast instructions allows the ValueEnumerator to be            // unmodified as it reserves instruction IDs during contruction. -          if (auto LI = dyn_cast<LoadInst>(&I)) { +          if (auto *LI = dyn_cast<LoadInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, LI->getPointerOperand(),                      LI->getType())) { @@ -257,7 +224,7 @@ public:              }              continue;            } -          if (auto SI = dyn_cast<StoreInst>(&I)) { +          if (auto *SI = dyn_cast<StoreInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, SI->getPointerOperand(),                      SI->getValueOperand()->getType())) { @@ -268,39 +235,16 @@ public:              }              continue;            } -          if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) { +          if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {              if (Value *NoOpBitcast = maybeGenerateBitcast(                      Builder, PointerTypes, I, GEP->getPointerOperand(),                      GEP->getSourceElementType()))                GEP->setOperand(0, NoOpBitcast);              continue;            } -          if 
(auto *CB = dyn_cast<CallBase>(&I)) { -            CB->removeFnAttrs(AttrMask); -            CB->removeRetAttrs(AttrMask); -            for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx) -              CB->removeParamAttrs(Idx, AttrMask); -            continue; -          }          }        }      } -    // Remove flags not for DXIL. -    cleanModuleFlags(M); - -    // dx.rootsignatures will have been parsed from its metadata form as its -    // binary form as part of the RootSignatureAnalysisWrapper, so safely -    // remove it as it is not recognized in DXIL -    if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) -      RootSignature->eraseFromParent(); - -    // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and -    // causes all tests using the DXIL Validator to fail. -    // -    // This is a temporary fix and should be replaced with a whitelist once -    // we have determined all metadata that the DXIL Validator allows -    if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) -      ErrNo->eraseFromParent();      return true;    } @@ -308,11 +252,11 @@ public:    DXILPrepareModule() : ModulePass(ID) {}    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.addRequired<DXILMetadataAnalysisWrapperPass>(); -    AU.addRequired<RootSignatureAnalysisWrapper>(); -    AU.addPreserved<RootSignatureAnalysisWrapper>(); -    AU.addPreserved<ShaderFlagsAnalysisWrapper>(); +      AU.addPreserved<DXILMetadataAnalysisWrapperPass>();      AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addPreserved<RootSignatureAnalysisWrapper>(); +    AU.addPreserved<ShaderFlagsAnalysisWrapper>();    }    static char ID; // Pass identification.  
}; @@ -323,7 +267,6 @@ char DXILPrepareModule::ID = 0;  INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module",                        false, false)  INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) -INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)  INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false,                      false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index 9eebcc9..1e4797b 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -7,8 +7,10 @@  //===----------------------------------------------------------------------===//  #include "DXILTranslateMetadata.h" +#include "DXILRootSignature.h"  #include "DXILShaderFlags.h"  #include "DirectX.h" +#include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Twine.h"  #include "llvm/Analysis/DXILMetadataAnalysis.h" @@ -204,9 +206,9 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,    return MDNode::get(Ctx, MDVals);  } -MDTuple *constructEntryMetadata(const Function *EntryFn, MDTuple *Signatures, -                                MDNode *Resources, MDTuple *Properties, -                                LLVMContext &Ctx) { +static MDTuple *constructEntryMetadata(const Function *EntryFn, +                                       MDTuple *Signatures, MDNode *Resources, +                                       MDTuple *Properties, LLVMContext &Ctx) {    // Each entry point metadata record specifies:    //  * reference to the entry point function global symbol    //  * unmangled name @@ -290,42 +292,82 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD,    return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx);  } -// TODO: We might need to refactor this to be more generic, -// in case we need more metadata to be replaced. 
-static void translateBranchMetadata(Module &M) { -  for (Function &F : M) { -    for (BasicBlock &BB : F) { -      Instruction *BBTerminatorInst = BB.getTerminator(); +static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) { +  MDNode *HlslControlFlowMD = +      BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); + +  if (!HlslControlFlowMD) +    return; -      MDNode *HlslControlFlowMD = -          BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); +  assert(HlslControlFlowMD->getNumOperands() == 2 && +         "invalid operands for hlsl.controlflow.hint"); -      if (!HlslControlFlowMD) -        continue; +  MDBuilder MDHelper(M.getContext()); -      assert(HlslControlFlowMD->getNumOperands() == 2 && -             "invalid operands for hlsl.controlflow.hint"); +  llvm::Metadata *HintsStr = MDHelper.createString("dx.controlflow.hints"); +  llvm::Metadata *HintsValue = MDHelper.createConstant( +      mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1))); -      MDBuilder MDHelper(M.getContext()); -      ConstantInt *Op1 = -          mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)); +  MDNode *MDNode = llvm::MDNode::get(M.getContext(), {HintsStr, HintsValue}); -      SmallVector<llvm::Metadata *, 2> Vals( -          ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"), -                               MDHelper.createConstant(Op1)}); +  BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); +  BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); +} + +static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) { +  return { +      M.getMDKindID("dx.nonuniform"),    M.getMDKindID("dx.controlflow.hints"), +      M.getMDKindID("dx.precise"),       llvm::LLVMContext::MD_range, +      llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias}; +} -      MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals); +static void translateInstructionMetadata(Module 
&M) { +  // construct allowlist of valid metadata node kinds +  std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M); -      BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode); -      BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr); +  for (Function &F : M) { +    for (BasicBlock &BB : F) { +      // This needs to be done first so that "hlsl.controlflow.hints" isn't +      // removed in the whitelist below +      if (auto *I = BB.getTerminator()) +        translateBranchMetadata(M, I); + +      for (auto &I : make_early_inc_range(BB)) { +        I.dropUnknownNonDebugMetadata(DXILCompatibleMDs); +      }      }    }  } -static void translateMetadata(Module &M, DXILResourceMap &DRM, -                              DXILResourceTypeMap &DRTM, -                              const ModuleShaderFlags &ShaderFlags, -                              const ModuleMetadataInfo &MMDI) { +static void cleanModuleFlags(Module &M) { +  NamedMDNode *MDFlags = M.getModuleFlagsMetadata(); +  if (!MDFlags) +    return; + +  SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries; +  M.getModuleFlagsMetadata(FlagEntries); +  bool Updated = false; +  for (auto &Flag : FlagEntries) { +    // llvm 3.7 only supports behavior up to AppendUnique. 
+    if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique) +      continue; +    Flag.Behavior = Module::ModFlagBehavior::Warning; +    Updated = true; +  } + +  if (!Updated) +    return; + +  MDFlags->eraseFromParent(); + +  for (auto &Flag : FlagEntries) +    M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val); +} + +static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM, +                                    DXILResourceTypeMap &DRTM, +                                    const ModuleShaderFlags &ShaderFlags, +                                    const ModuleMetadataInfo &MMDI) {    LLVMContext &Ctx = M.getContext();    IRBuilder<> IRB(Ctx);    SmallVector<MDNode *> EntryFnMDNodes; @@ -381,6 +423,22 @@ static void translateMetadata(Module &M, DXILResourceMap &DRM,        M.getOrInsertNamedMetadata("dx.entryPoints");    for (auto *Entry : EntryFnMDNodes)      EntryPointsNamedMD->addOperand(Entry); + +  cleanModuleFlags(M); + +  // dx.rootsignatures will have been parsed from its metadata form as its +  // binary form as part of the RootSignatureAnalysisWrapper, so safely +  // remove it as it is not recognized in DXIL +  if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures")) +    RootSignature->eraseFromParent(); + +  // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and +  // causes all tests using the DXIL Validator to fail. 
+  // +  // This is a temporary fix and should be replaced with a allowlist once +  // we have determined all metadata that the DXIL Validator allows +  if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa")) +    ErrNo->eraseFromParent();  }  PreservedAnalyses DXILTranslateMetadata::run(Module &M, @@ -390,8 +448,8 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,    const ModuleShaderFlags &ShaderFlags = MAM.getResult<ShaderFlagsAnalysis>(M);    const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M); -  translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); -  translateBranchMetadata(M); +  translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); +  translateInstructionMetadata(M);    return PreservedAnalyses::all();  } @@ -409,10 +467,13 @@ public:      AU.addRequired<DXILResourceWrapperPass>();      AU.addRequired<ShaderFlagsAnalysisWrapper>();      AU.addRequired<DXILMetadataAnalysisWrapperPass>(); -    AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addRequired<RootSignatureAnalysisWrapper>(); +      AU.addPreserved<DXILMetadataAnalysisWrapperPass>(); -    AU.addPreserved<ShaderFlagsAnalysisWrapper>();      AU.addPreserved<DXILResourceBindingWrapperPass>(); +    AU.addPreserved<DXILResourceWrapperPass>(); +    AU.addPreserved<RootSignatureAnalysisWrapper>(); +    AU.addPreserved<ShaderFlagsAnalysisWrapper>();    }    bool runOnModule(Module &M) override { @@ -425,8 +486,8 @@ public:      dxil::ModuleMetadataInfo MMDI =          getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata(); -    translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); -    translateBranchMetadata(M); +    translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI); +    translateInstructionMetadata(M);      return true;    }  }; @@ -443,6 +504,7 @@ INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata",                        "DXIL Translate Metadata", false, false)  INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)  
INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) +INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)  INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)  INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata",                      "DXIL Translate Metadata", false, false) diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h index f3f5eb1..4c1ffac 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h @@ -13,7 +13,8 @@  namespace llvm { -/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes +/// A pass that transforms LLVM Metadata in the module to it's DXIL equivalent, +/// then emits all recognized DXIL Metadata  class DXILTranslateMetadata : public PassInfoMixin<DXILTranslateMetadata> {  public:    PreservedAnalyses run(Module &M, ModuleAnalysisManager &); diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index fb0928b8..ede8463 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -79,6 +79,12 @@ def ExtensionHVXV79: SubtargetFeature<"hvxv79", "HexagonHVXVersion",         ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,         ExtensionHVXV73, ExtensionHVXV75]>; +def ExtensionHVXV81: SubtargetFeature<"hvxv81", "HexagonHVXVersion", +      "Hexagon::ArchEnum::V81", "Hexagon HVX instructions", +      [ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67, +       ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, +       ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79]>; +  def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",        "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;  def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps", @@ -151,6 +157,8 @@ def UseHVXV75          : Predicate<"HST->useHVXV75Ops()">,                           
AssemblerPredicate<(all_of ExtensionHVXV75)>;  def UseHVXV79          : Predicate<"HST->useHVXV79Ops()">,                           AssemblerPredicate<(all_of ExtensionHVXV79)>; +def UseHVXV81          : Predicate<"HST->useHVXV81Ops()">, +                         AssemblerPredicate<(all_of ExtensionHVXV81)>;  def UseAudio           : Predicate<"HST->useAudioOps()">,                           AssemblerPredicate<(all_of ExtensionAudio)>;  def UseZReg            : Predicate<"HST->useZRegOps()">, @@ -488,6 +496,11 @@ def : Proc<"hexagonv79", HexagonModelV79,             ArchV68, ArchV69, ArchV71, ArchV73, ArchV75, ArchV79,             FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,             FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; +def : Proc<"hexagonv81", HexagonModelV81, +           [ArchV65, ArchV66, ArchV67, ArchV68, ArchV69, ArchV71, ArchV73, +            ArchV75, ArchV79, ArchV81, +            FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, +            FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;  // Need to update the correct features for tiny core.  
// Disable NewValueJumps since the packetizer is unable to handle a packet with diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 8984534..9bf4034 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -29,7 +29,8 @@ enum class ArchEnum {    V71,    V73,    V75, -  V79 +  V79, +  V81  };  inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) { @@ -50,6 +51,7 @@ inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {        .Case("hexagonv73", Hexagon::ArchEnum::V73)        .Case("hexagonv75", Hexagon::ArchEnum::V75)        .Case("hexagonv79", Hexagon::ArchEnum::V79) +      .Case("hexagonv81", Hexagon::ArchEnum::V81)        .Default(std::nullopt);  }  } // namespace Hexagon diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td index 8ec1d93..f623fd0 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.td +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td @@ -34,3 +34,5 @@ def ArchV75: SubtargetFeature<"v75", "HexagonArchVersion", "Hexagon::ArchEnum::V  def HasV75 : Predicate<"HST->hasV75Ops()">, AssemblerPredicate<(all_of ArchV75)>;  def ArchV79: SubtargetFeature<"v79", "HexagonArchVersion", "Hexagon::ArchEnum::V79", "Enable Hexagon V79 architecture">;  def HasV79 : Predicate<"HST->hasV79Ops()">, AssemblerPredicate<(all_of ArchV79)>; +def ArchV81: SubtargetFeature<"v81", "HexagonArchVersion", "Hexagon::ArchEnum::V81", "Enable Hexagon V81 architecture">; +def HasV81 : Predicate<"HST->hasV81Ops()">, AssemblerPredicate<(all_of ArchV81)>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index 93696e0..f4e36fa7 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -7222,3 +7222,595 @@ class DepHVXItinV79 {        [Hex_FWD, Hex_FWD, HVX_FWD]>    ];  } + +class DepHVXItinV81 { +  list<InstrItinData> 
DepHVXItinV81_list = [ +    InstrItinData <tc_0390c1ca, /*SLOT01,LOAD,VA,VX_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [], +      []>, + +    InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, 
CVI_MPY1], 0>, +       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], +      [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_227864f7, /*SLOT0,STORE,VA,VX_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], +      [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ +    
  [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], +      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_37820f4c, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], +      [HVX_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + 
+    InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_4942646a, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_531b383c, /*SLOT0123*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, 
HVX_FWD]>, + +    InstrItinData <tc_540c3da3, /*SLOT0,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], +      [Hex_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_56e64202, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_649072c2, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, 
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_7177e272, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], +      [HVX_FWD]>, + +    InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_72e2b393, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       
InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_73efe966, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_7417e785, /*SLOT0123,VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [3, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_8772086c, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/ +      [InstrStage<1, [SLOT2], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], +    
  [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_946013d8, /*SLOT0123,VP*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_9a1cab75, /*SLOT01,LOAD,VA,VX_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 3, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9aff7a2a, /*SLOT0,STORE,VA,VX_DV*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>, +       InstrStage<1, [CVI_MPY01]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a19b9305, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, 
+       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], +      [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_XLANE]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ac4046bc, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], +      [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_b091f1c6, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       
InstrStage<1, [CVI_ALL]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_LD], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], +      [HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c127de3a, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_c4edf264, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], +      [HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], +      [Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, +       InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_cda936da, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_dcca380f, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_e2fdd6e6, /*SLOT0123*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5], +      [HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], +      [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + +    InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_ALL]>], [3], +      [HVX_FWD]>, + +    InstrItinData <tc_e675c45a, 
/*SLOT23,VX_DV*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e699ae41, /*SLOT01,ZW*/ +      [InstrStage<1, [SLOT0, SLOT1], 0>, +       InstrStage<1, [CVI_ZW]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + +    InstrItinData <tc_f175e046, /*SLOT23,VX*/ +      [InstrStage<1, [SLOT2, SLOT3], 0>, +       InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], +      [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ +      [InstrStage<1, [SLOT2], 0>, +       InstrStage<1, [CVI_MPY01]>], [9, 5, 2], +      [HVX_FWD, HVX_FWD, Hex_FWD]>, + +    InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/ +      [InstrStage<1, [SLOT0], 0>, +       InstrStage<1, [SLOT1], 0>, +       InstrStage<1, [CVI_ST], 0>, +       InstrStage<1, [CVI_XLANE]>], [1, 2, 5], +      [Hex_FWD, Hex_FWD, HVX_FWD]> +  ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td index 7a1ad3e..48b665c 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -13740,3 +13740,891 @@ class DepScalarItinV79 {        [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>    ];  } + +class DepScalarItinV81 { +  list<InstrItinData> DepScalarItinV81_list = [ +    InstrItinData <tc_011e0e9d, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_01d44cb2, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_01e1be3b, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_02fe1c65, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0655b949, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_075c8dd8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0a195f2c, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0a43be35, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_0a6c20ae, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0ba0d5da, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_0dfac0a7, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_0fac1eb8, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 3], +      
[Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_112d30d6, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_1242dc2a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_1248597c, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_139ef484, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_14ab4f41, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_151bf368, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_158aa3f7, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_197dce51, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1981450d, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_1c2c7a4a, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1c7522a8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1d41f8b7, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1fcb8495, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_1fe4ab69, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_20131976, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_2237d952, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_23708a21, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], +      []>, + +    InstrItinData <tc_2471c1c8, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_24e109c7, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_24f426ab, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_27106296, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_280f7fe1, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_28e55c6f, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c13e7f5, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_2c3e17fc, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_2f573607, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_33e7e673, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_362b0be2, /*tc_3*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_38382228, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_388f9897, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_38e0bae9, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, 
Hex_FWD]>, + +    InstrItinData <tc_3d14a17b, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3edca78f, /*tc_2*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_3fbf1042, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_407e96f9, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_40d64c94, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4222e6bf, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_42ff66ba, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_442395f3, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_449acf79, /*tc_latepredstaia*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_44d5a428, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_44fffc58, /*tc_3*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_45791fb8, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_45f9d1be, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_46c18ecf, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_49fdfd4b, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4a55d03c, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 
2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4abdbdc6, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4ac61d92, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_4bf903b0, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_503ce0f3, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_512b1653, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_53c851ab, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_54f0cee2, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_5502c366, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55255f2b, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [], +      []>, + +    InstrItinData <tc_556f6577, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55a9a350, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_55b33fda, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_56a124a7, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_57a55b54, /*tc_1*/ +      [InstrStage<1, [SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5944960d, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_59a7822c, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2], 
+      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5a222e89, /*tc_2early*/ +      [InstrStage<1, [SLOT2]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5a4b5e58, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5b347363, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5ceb2f9e, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5da50c4b, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5deb5e47, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5e4cf0e8, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_60e324ff, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_63567288, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4], +      [Hex_FWD]>, + +    InstrItinData <tc_64b00d8a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_651cbe02, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_65279839, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_65cbd974, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_69bfb303, /*tc_3*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData 
<tc_6aa823ab, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6ae3426b, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6d861a95, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6e20402a, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6f42bc60, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6fb52018, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_6fc5dbea, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_711c805f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_713b66bf, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7401744f, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7476d766, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_74a42bda, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_759e57be, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_76bb5435, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7d6a2568, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_77f94a5e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    
InstrItinData <tc_788b1d09, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_78f87ed3, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_7af3a37e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 3], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7b9187d3, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7c28bd7e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3], +      [Hex_FWD]>, + +    InstrItinData <tc_7c31e19a, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7c6d32e4, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7dc63b5c, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7f58404a, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [], +      []>, + +    InstrItinData <tc_7f7f45f5, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_7f8ae742, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8035e91f, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_822c3c68, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_829d8a86, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_838c4d7a, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_84a7500d, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + 
     [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_86173609, /*tc_2latepred*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_887d1bb7, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8a6d0d94, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8a825db2, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8b5bd4f5, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8e82e8ca, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_8f36a2fd, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9124c04f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_92240447, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_934753bb, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_937dd41c, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [], +      []>, + +    InstrItinData <tc_9406230a, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_95a33176, /*tc_2*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_95f43c5e, /*tc_3*/ +      [InstrStage<1, [SLOT2]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_96ef76ef, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 1, 2, 3], +      [Hex_FWD, 
Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_975a4e54, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 3, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9783714b, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9b20a062, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9b34f5e0, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_9b3c0462, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9bcfb2ee, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9c52f549, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9e27f2f9, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9e72dc89, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9edb7c77, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9edefe01, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_9f6cd987, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a08b630b, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a1297125, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a154b476, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2], +      
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a2b365d2, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a3070909, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a32e03e7, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a38c45dc, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a4e22bbd, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a4ee89db, /*tc_2early*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_a724463d, /*tc_3stall*/ +      [InstrStage<1, [SLOT0]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a7a13fac, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a7bdb22c, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_a9edeffa, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_abfd9a6d, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ac65613f, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_addc37a8, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ae5babd7, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_aee6250c, /*tc_ld*/ +      
[InstrStage<1, [SLOT0, SLOT1]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_af6af259, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b1ae5f67, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_b2196a3f, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [1, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b3d46584, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_b4dc7630, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b7c4062a, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_b837298f, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], +      []>, + +    InstrItinData <tc_b9bec29e, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [], +      []>, + +    InstrItinData <tc_ba9255a6, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb07f2c5, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb78483e, /*tc_3stall*/ +      [InstrStage<1, [SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bb831a7c, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_bf2ffc0f, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c20701f0, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c21d7447, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, 
Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c57d9f39, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_c818ff7f, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [], +      []>, + +    InstrItinData <tc_ce59038e, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_cfa0e29b, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [2, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d03278fd, /*tc_st*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d234b61a, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_d33e5eee, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d3632d88, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d45ba9cd, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_d57d649c, /*tc_3stall*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_d61dfdc3, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d68dca5c, /*tc_3stall*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d71ea8fa, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_d7718fbe, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_db596beb, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_db96aa6b, /*tc_st*/ +      [InstrStage<1, [SLOT0]>], [1], +      [Hex_FWD]>, + +    InstrItinData <tc_dc51281d, 
/*tc_3*/ +      [InstrStage<1, [SLOT2]>], [2, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_decdde8a, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_df5d53f9, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e3d699e3, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_e60def48, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_e9170fb7, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ed03645c, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ed3f8d2a, /*tc_ld*/ +      [InstrStage<1, [SLOT0]>], [4, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_eed07714, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_eeda4109, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_ef921005, /*tc_1*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f098b237, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f0cdeccf, /*tc_3x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f0e8e832, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f34c1c21, /*tc_2*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f38f92e1, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_f529831b, 
/*tc_latepredstaia*/ +      [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f6e2aff9, /*tc_newvjump*/ +      [InstrStage<1, [SLOT0]>], [3, 2, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f7569068, /*tc_4x*/ +      [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_f97707c1, /*tc_1*/ +      [InstrStage<1, [SLOT2]>], [2], +      [Hex_FWD]>, + +    InstrItinData <tc_f999c66e, /*tc_1*/ +      [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_fae9dfa5, /*tc_3x*/ +      [InstrStage<1, [SLOT3]>], [4, 2], +      [Hex_FWD, Hex_FWD]>, + +    InstrItinData <tc_fedb7e19, /*tc_ld*/ +      [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2], +      [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> +  ]; +}
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index ae96753..f8f1c2a 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -39178,6 +39178,19 @@ let opNewValue = 0;  let isCVI = 1;  let DecoderNamespace = "EXT_mmvec";  } +def V6_vsub_hf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +}  def V6_vsub_qf16 : HInst<  (outs HvxVR:$Vd32),  (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -39269,6 +39282,19 @@ let opNewValue = 0;  let isCVI = 1;  let DecoderNamespace = "EXT_mmvec";  } +def V6_vsub_sf_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011010000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +}  def V6_vsub_sf_sf : HInst<  (outs HvxVR:$Vd32),  (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -41116,6 +41142,17 @@ let hasNewValue = 1;  let opNewValue = 0;  let isSolo = 1;  } +def Y2_tlbpp : HInst< +(outs IntRegs:$Rd32), +(ins DoubleRegs:$Rss32), +"$Rd32 = tlbp($Rss32)", +tc_6aa823ab, TypeCR>, Enc_90cd8b, Requires<[HasV81]> { +let Inst{13-5} = 0b000000000; +let Inst{31-21} = 0b01101100011; +let hasNewValue = 1; +let opNewValue = 0; +let isSolo = 1; +}  def Y2_tlbr : HInst<  (outs DoubleRegs:$Rdd32),  (ins IntRegs:$Rs32), diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index 17cb96c..23f4b3a 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -3827,3 +3827,14 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2),           (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX64B]>;  def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),           (V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX128B]>; + +// V81 HVX Instructions. + +def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_mix_128B HvxVR:$src1, HvxVR:$src2), +         (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e285e04..7ee280d 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {        IntNo == Intrinsic::hexagon_V6_vgathermh ||        IntNo == Intrinsic::hexagon_V6_vgathermh_128B ||        IntNo == Intrinsic::hexagon_V6_vgathermhw || -      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) { +      IntNo == Intrinsic::hexagon_V6_vgathermhw_128B || +      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh || +      IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) {      SelectV65Gather(N);      return;    } diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp 
b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index c7a4f68..3cc146b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {    case Intrinsic::hexagon_V6_vgathermhw_128B:      Opcode = Hexagon::V6_vgathermhw_pseudo;      break; +  case Intrinsic::hexagon_V6_vgather_vscattermh: +  case Intrinsic::hexagon_V6_vgather_vscattermh_128B: +    Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo; +    break;    }    SDVTList VTs = CurDAG->getVTList(MVT::Other); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 9f7f434..526b4de 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,    case Intrinsic::hexagon_V6_vgathermhq:    case Intrinsic::hexagon_V6_vgathermhq_128B:    case Intrinsic::hexagon_V6_vgathermhwq: -  case Intrinsic::hexagon_V6_vgathermhwq_128B: { +  case Intrinsic::hexagon_V6_vgathermhwq_128B: +  case Intrinsic::hexagon_V6_vgather_vscattermh: +  case Intrinsic::hexagon_V6_vgather_vscattermh_128B: {      const Module &M = *I.getParent()->getParent()->getParent();      Info.opc = ISD::INTRINSIC_W_CHAIN;      Type *VecTy = I.getArgOperand(1)->getType(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 939841a..47726d6 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {    MachineBasicBlock::iterator First;    switch (Opc) { -    case Hexagon::V6_vgathermh_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) -                  .add(MI.getOperand(2)) -                  
.add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermw_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhw_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -                  .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermwq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -          
        .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); - -    case Hexagon::V6_vgathermhwq_pseudo: -      First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) -                  .add(MI.getOperand(2)) -                  .add(MI.getOperand(3)) -                  .add(MI.getOperand(4)) -                  .add(MI.getOperand(5)); -      BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) -          .add(MI.getOperand(0)) -          .addImm(MI.getOperand(1).getImm()) -          .addReg(Hexagon::VTMP); -      MBB.erase(MI); -      return First.getInstrIterator(); +  case Hexagon::V6_vgather_vscatter_mh_pseudo: +    // This is mainly a place holder. It will be extended. +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh)) +        .add(MI.getOperand(2)) +        .add(MI.getOperand(3)) +        .add(MI.getOperand(4)) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); +  case Hexagon::V6_vgathermh_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermw_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhw_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermwq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        .add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator(); + +  case Hexagon::V6_vgathermhwq_pseudo: +    First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq)) +                .add(MI.getOperand(2)) +                .add(MI.getOperand(3)) +                .add(MI.getOperand(4)) +                .add(MI.getOperand(5)); +    BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai)) +        
.add(MI.getOperand(0)) +        .addImm(MI.getOperand(1).getImm()) +        .addReg(Hexagon::VTMP); +    MBB.erase(MI); +    return First.getInstrIterator();    }    return MI.getIterator(); @@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,    case Hexagon::V6_vL32b_nt_tmp_npred_ai:    case Hexagon::V6_vS32Ub_npred_ai:    case Hexagon::V6_vgathermh_pseudo: +  case Hexagon::V6_vgather_vscatter_mh_pseudo:    case Hexagon::V6_vgathermw_pseudo:    case Hexagon::V6_vgathermhw_pseudo:    case Hexagon::V6_vgathermhq_pseudo: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td index f927f9b..42393d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td @@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo  : vgathermh<HvxVR>;  defm V6_vgathermw_pseudo  : vgathermw<HvxVR>;  defm V6_vgathermhw_pseudo  : vgathermhw<HvxWR>; + +multiclass vgather_scatter_mh<RegisterClass RC> { +  let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, +  mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in +  def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ), +                           (ins IntRegs:$_dst_, s4_0Imm:$Ii, +                                IntRegs:$Rt, ModRegs:$Mu, RC:$Vv), +                           ".error \"should not emit\" ", +                           []>; +} + +defm V6_vgather_vscatter_mh_pseudo  : vgather_scatter_mh<HvxVR>; +  multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {    let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,    mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td index b8a9cf3..9bcd4bf 100644 --- a/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -75,3 +75,4 @@ include "HexagonScheduleV71T.td"  include "HexagonScheduleV73.td" 
 include "HexagonScheduleV75.td"  include "HexagonScheduleV79.td" +include "HexagonScheduleV81.td"
\ No newline at end of file diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV81.td b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td new file mode 100644 index 0000000..dd5f5a0 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td @@ -0,0 +1,31 @@ +//=-HexagonScheduleV81.td - HexagonV81 Scheduling Definitions *- tablegen -*-=// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def HexagonV81ItinList : DepScalarItinV81, ScalarItin, +                         DepHVXItinV81, HVXItin, PseudoItin { +  list<InstrItinData> ItinList = +    !listconcat(DepScalarItinV81_list, ScalarItin_list, +                DepHVXItinV81_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV81 : +      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, +                            CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, +                            CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, +                            CVI_ALL_NOMEM, CVI_ZW], +                            [Hex_FWD, HVX_FWD], +                            HexagonV81ItinList.ItinList>; + +def HexagonModelV81 : SchedMachineModel { +  // Max issue per cycle == bundle width. 
+  let IssueWidth = 4; +  let Itineraries = HexagonItinerariesV81; +  let LoadLatency = 1; +  let CompleteModel = 0; +} diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 7430567..995f66d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -224,6 +224,15 @@ public:    bool useHVXV79Ops() const {      return HexagonHVXVersion >= Hexagon::ArchEnum::V79;    } +  bool hasV81Ops() const { +    return getHexagonArchVersion() >= Hexagon::ArchEnum::V81; +  } +  bool hasV81OpsOnly() const { +    return getHexagonArchVersion() == Hexagon::ArchEnum::V81; +  } +  bool useHVXV81Ops() const { +    return HexagonHVXVersion >= Hexagon::ArchEnum::V81; +  }    bool useAudioOps() const { return UseAudioOps; }    bool useCompound() const { return UseCompound; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 171e294..e925e04 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -31,6 +31,10 @@ using namespace llvm;  static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),      cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); +cl::opt<bool> HexagonAllowScatterGatherHVX( +    "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden, +    cl::desc("Allow auto-generation of HVX scatter-gather")); +  static cl::opt<bool> EnableV68FloatAutoHVX(      "force-hvx-float", cl::Hidden,      cl::desc("Enable auto-vectorization of floatint point types on v68.")); @@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,    return HexagonMaskedVMem && ST.isTypeForHVX(DataType);  } +bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const { +  // For now assume we can not deal with all HVX datatypes. 
+  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || +      !HexagonAllowScatterGatherHVX) +    return false; +  // This must be in sync with HexagonVectorCombine pass. +  switch (Ty->getScalarSizeInBits()) { +  case 8: +    return (getTypeNumElements(Ty) == 128); +  case 16: +    if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32) +      return (Alignment >= 2); +    break; +  case 32: +    if (getTypeNumElements(Ty) == 32) +      return (Alignment >= 4); +    break; +  default: +    break; +  } +  return false; +} + +bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const { +  if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) || +      !HexagonAllowScatterGatherHVX) +    return false; +  // This must be in sync with HexagonVectorCombine pass. +  switch (Ty->getScalarSizeInBits()) { +  case 8: +    return (getTypeNumElements(Ty) == 128); +  case 16: +    if (getTypeNumElements(Ty) == 64) +      return (Alignment >= 2); +    break; +  case 32: +    if (getTypeNumElements(Ty) == 32) +      return (Alignment >= 4); +    break; +  default: +    break; +  } +  return false; +} + +bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, +                                                Align Alignment) const { +  return !isLegalMaskedGather(VTy, Alignment); +} + +bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy, +                                                 Align Alignment) const { +  return !isLegalMaskedScatter(VTy, Alignment); +} +  /// --- Vector TTI end ---  unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index dbf16c9..cec2bf9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -169,6 +169,12 @@ public:                            unsigned AddressSpace) const override;    bool isLegalMaskedLoad(Type *DataType, 
Align Alignment,                           unsigned AddressSpace) const override; +  bool isLegalMaskedGather(Type *Ty, Align Alignment) const override; +  bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override; +  bool forceScalarizeMaskedGather(VectorType *VTy, +                                  Align Alignment) const override; +  bool forceScalarizeMaskedScatter(VectorType *VTy, +                                   Align Alignment) const override;    /// @} diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 9ab5202..5c50ec2 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -57,6 +57,11 @@  #define DEBUG_TYPE "hexagon-vc" +// This is a const that represents default HVX VTCM page size. +// It is boot time configurable, so we probably want an API to +// read it, but for now assume 128KB +#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072 +  using namespace llvm;  namespace { @@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {  class HvxIdioms {  public: +  enum DstQualifier { +    Undefined = 0, +    Arithmetic, +    LdSt, +    LLVM_Gather, +    LLVM_Scatter, +    HEX_Gather_Scatter, +    HEX_Gather, +    HEX_Scatter, +    Call +  }; +    HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {      auto *Int32Ty = HVC.getIntTy(32);      HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false); @@ -473,6 +490,11 @@ private:    auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,                       Signedness SgnX, ArrayRef<Value *> WordY,                       Signedness SgnY) const -> SmallVector<Value *>; +  // Vector manipulations for Ripple +  bool matchScatter(Instruction &In) const; +  bool matchGather(Instruction &In) const; +  Value *processVScatter(Instruction &In) const; +  Value *processVGather(Instruction &In) const;    VectorType *HvxI32Ty;    VectorType 
*HvxP32Ty; @@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {  }  auto AlignVectors::run() -> bool { -  LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName() +  LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()                      << '\n');    if (!createAddressGroups())      return false; @@ -1797,6 +1819,846 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const    return Ext;  } +inline bool HvxIdioms::matchScatter(Instruction &In) const { +  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); +  if (!II) +    return false; +  return (II->getIntrinsicID() == Intrinsic::masked_scatter); +} + +inline bool HvxIdioms::matchGather(Instruction &In) const { +  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In); +  if (!II) +    return false; +  return (II->getIntrinsicID() == Intrinsic::masked_gather); +} + +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual); + +// Binary instructions we want to handle as users of gather/scatter. +inline bool isArithmetic(unsigned Opc) { +  switch (Opc) { +  case Instruction::Add: +  case Instruction::Sub: +  case Instruction::Mul: +  case Instruction::And: +  case Instruction::Or: +  case Instruction::Xor: +  case Instruction::AShr: +  case Instruction::LShr: +  case Instruction::Shl: +  case Instruction::UDiv: +    return true; +  } +  return false; +} + +// TODO: Maybe use MemoryLocation for this. See getLocOrNone above. 
+inline Value *getPointer(Value *Ptr) { +  assert(Ptr && "Unable to extract pointer"); +  if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) +    return Ptr; +  if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr)) +    return getLoadStorePointerOperand(Ptr); +  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) { +    if (II->getIntrinsicID() == Intrinsic::masked_store) +      return II->getOperand(1); +  } +  return nullptr; +} + +static Instruction *selectDestination(Instruction *In, +                                      HvxIdioms::DstQualifier &Qual) { +  Instruction *Destination = nullptr; +  if (!In) +    return Destination; +  if (isa<StoreInst>(In)) { +    Destination = In; +    Qual = HvxIdioms::LdSt; +  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { +    if (II->getIntrinsicID() == Intrinsic::masked_gather) { +      Destination = In; +      Qual = HvxIdioms::LLVM_Gather; +    } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) { +      Destination = In; +      Qual = HvxIdioms::LLVM_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::masked_store) { +      Destination = In; +      Qual = HvxIdioms::LdSt; +    } else if (II->getIntrinsicID() == +               Intrinsic::hexagon_V6_vgather_vscattermh) { +      Destination = In; +      Qual = HvxIdioms::HEX_Gather_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) { +      Destination = In; +      Qual = HvxIdioms::HEX_Scatter; +    } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) { +      Destination = In; +      Qual = HvxIdioms::HEX_Gather; +    } +  } else if (isa<ZExtInst>(In)) { +    return locateDestination(In, Qual); +  } else if (isa<CastInst>(In)) { +    return locateDestination(In, Qual); +  } else if (isa<CallInst>(In)) { +    Destination = In; +    Qual = HvxIdioms::Call; +  } else if (isa<GetElementPtrInst>(In)) { +    return locateDestination(In, Qual); +  } else if 
(isArithmetic(In->getOpcode())) { +    Destination = In; +    Qual = HvxIdioms::Arithmetic; +  } else { +    LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n"); +  } +  return Destination; +} + +// This method attempts to find destination (user) for a given intrinsic. +// Given that these are produced only by Ripple, the number of options is +// limited. Simplest case is explicit store which in fact is redundant (since +// HVX gather creates its own store during packetization). Nevertheless we need +// to figure out the address where we are storing. Other cases are more +// complicated, but still few. +Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) { +  Instruction *Destination = nullptr; +  if (!In) +    return Destination; +  // Get all possible destinations +  SmallVector<Instruction *> Users; +  // Iterate over the uses of the instruction +  for (auto &U : In->uses()) { +    if (auto *UI = dyn_cast<Instruction>(U.getUser())) { +      Destination = selectDestination(UI, Qual); +      if (Destination) +        Users.push_back(Destination); +    } +  } +  // Now see which of the users (if any) is a memory destination. +  for (auto *I : Users) +    if (getPointer(I)) +      return I; +  return Destination; +} + +// The two intrinsics we handle here have GEP in a different position. 
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) { +  assert(In && "Bad instruction"); +  IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In); +  assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather || +                  IIn->getIntrinsicID() == Intrinsic::masked_scatter)) && +         "Not a gather Intrinsic"); +  GetElementPtrInst *GEPIndex = nullptr; +  if (IIn->getIntrinsicID() == Intrinsic::masked_gather) +    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0)); +  else +    GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1)); +  return GEPIndex; +} + +// Given the intrinsic find its GEP argument and extract base address it uses. +// The method relies on the way how Ripple typically forms the GEP for +// scatter/gather. +static Value *locateAddressFromIntrinsic(Instruction *In) { +  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); +  if (!GEPIndex) { +    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n"); +    return nullptr; +  } +  Value *BaseAddress = GEPIndex->getPointerOperand(); +  auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress); +  if (IndexLoad) +    return IndexLoad; + +  auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress); +  if (IndexZEx) { +    IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0)); +    if (IndexLoad) +      return IndexLoad; +    IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0)); +    if (II && II->getIntrinsicID() == Intrinsic::masked_gather) +      return locateAddressFromIntrinsic(II); +  } +  auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress); +  if (BaseShuffle) { +    IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0)); +    if (IndexLoad) +      return IndexLoad; +    auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0)); +    if (IE) { +      auto *Src = IE->getOperand(1); +      IndexLoad = dyn_cast<LoadInst>(Src); +      if (IndexLoad) +        return IndexLoad; +      auto *Alloca = dyn_cast<AllocaInst>(Src); +      if 
(Alloca) +        return Alloca; +      if (isa<Argument>(Src)) { +        return Src; +      } +      if (isa<GlobalValue>(Src)) { +        return Src; +      } +    } +  } +  LLVM_DEBUG(dbgs() << "  Unable to locate Address from intrinsic\n"); +  return nullptr; +} + +static Type *getIndexType(Value *In) { +  if (!In) +    return nullptr; + +  if (isa<LoadInst>(In) || isa<StoreInst>(In)) +    return getLoadStoreType(In); + +  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { +    if (II->getIntrinsicID() == Intrinsic::masked_load) +      return II->getType(); +    if (II->getIntrinsicID() == Intrinsic::masked_store) +      return II->getOperand(0)->getType(); +  } +  return In->getType(); +} + +static Value *locateIndexesFromGEP(Value *In) { +  if (!In) +    return nullptr; +  if (isa<LoadInst>(In)) +    return In; +  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) { +    if (II->getIntrinsicID() == Intrinsic::masked_load) +      return In; +    if (II->getIntrinsicID() == Intrinsic::masked_gather) +      return In; +  } +  if (auto *IndexZEx = dyn_cast<ZExtInst>(In)) +    return locateIndexesFromGEP(IndexZEx->getOperand(0)); +  if (auto *IndexSEx = dyn_cast<SExtInst>(In)) +    return locateIndexesFromGEP(IndexSEx->getOperand(0)); +  if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In)) +    return locateIndexesFromGEP(BaseShuffle->getOperand(0)); +  if (auto *IE = dyn_cast<InsertElementInst>(In)) +    return locateIndexesFromGEP(IE->getOperand(1)); +  if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In)) +    return cstDataVector; +  if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In)) +    return GEPIndex->getOperand(0); +  return nullptr; +} + +// Given the intrinsic find its GEP argument and extract offsetts from the base +// address it uses. 
+static Value *locateIndexesFromIntrinsic(Instruction *In) { +  GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In); +  if (!GEPIndex) { +    LLVM_DEBUG(dbgs() << "  No GEP in intrinsic\n"); +    return nullptr; +  } +  Value *Indexes = GEPIndex->getOperand(1); +  if (auto *IndexLoad = locateIndexesFromGEP(Indexes)) +    return IndexLoad; + +  LLVM_DEBUG(dbgs() << "  Unable to locate Index from intrinsic\n"); +  return nullptr; +} + +// Because of aukward definition of many Hex intrinsics we often have to +// reinterprete HVX native <64 x i16> as <32 x i32> which in practice is a NOP +// for all use cases, so this only exist to make IR builder happy. +inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC, +                                               IRBuilderBase &Builder, +                                               LLVMContext &Ctx, Value *I) { +  assert(I && "Unable to reinterprete cast"); +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  std::vector<unsigned> shuffleMask; +  for (unsigned i = 0; i < 64; ++i) +    shuffleMask.push_back(i); +  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); +  Value *CastShuffle = +      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); +  return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32"); +} + +// Recast <128 x i8> as <32 x i32> +inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC, +                                              IRBuilderBase &Builder, +                                              LLVMContext &Ctx, Value *I) { +  assert(I && "Unable to reinterprete cast"); +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  std::vector<unsigned> shuffleMask; +  for (unsigned i = 0; i < 128; ++i) +    shuffleMask.push_back(i); +  Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask); +  Value *CastShuffle = +      Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle"); +  return 
Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32"); +} + +// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern +inline Value *get_i32_Mask(const HexagonVectorCombine &HVC, +                           IRBuilderBase &Builder, LLVMContext &Ctx, +                           unsigned int pattern) { +  std::vector<unsigned int> byteMask; +  for (unsigned i = 0; i < 32; ++i) +    byteMask.push_back(pattern); + +  return Builder.CreateIntrinsic( +      HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt), +      {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)}, +      nullptr); +} + +Value *HvxIdioms::processVScatter(Instruction &In) const { +  auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType()); +  assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather"); +  unsigned InpSize = HVC.getSizeOf(InpTy); +  auto *F = In.getFunction(); +  LLVMContext &Ctx = F->getContext(); +  auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType()); +  assert(ElemTy && "llvm.scatter needs integer type argument"); +  unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy); +  LLVM_DEBUG({ +    unsigned Elements = HVC.length(InpTy); +    dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n"; +    dbgs() << "  Input type(" << *InpTy << ") elements(" << Elements +           << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth(" +           << ElemWidth << ")\n"; +  }); + +  IRBuilder Builder(In.getParent(), In.getIterator(), +                    InstSimplifyFolder(HVC.DL)); + +  auto *ValueToScatter = In.getOperand(0); +  LLVM_DEBUG(dbgs() << "  ValueToScatter   : " << *ValueToScatter << "\n"); + +  if (HVC.HST.getVectorLength() != InpSize) { +    LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize +                      << ") for vscatter\n"); +    return nullptr; +  } + +  // Base address of indexes. 
+  auto *IndexLoad = locateAddressFromIntrinsic(&In); +  if (!IndexLoad) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  IndexLoad        : " << *IndexLoad << "\n"); + +  // Address of destination. Must be in VTCM. +  auto *Ptr = getPointer(IndexLoad); +  if (!Ptr) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Ptr              : " << *Ptr << "\n"); +  // Indexes/offsets +  auto *Indexes = locateIndexesFromIntrinsic(&In); +  if (!Indexes) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Indexes          : " << *Indexes << "\n"); +  Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx), +                                                    "cst_ptr_to_i32"); +  LLVM_DEBUG(dbgs() << "  CastedDst        : " << *CastedDst << "\n"); +  // Adjust Indexes +  auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +  Value *CastIndex = nullptr; +  if (cstDataVector) { +    // Our indexes are represented as a constant. We need it in a reg. +    AllocaInst *IndexesAlloca = +        Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false)); +    [[maybe_unused]] auto *StoreIndexes = +        Builder.CreateStore(cstDataVector, IndexesAlloca); +    LLVM_DEBUG(dbgs() << "  StoreIndexes     : " << *StoreIndexes << "\n"); +    CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(), +                                   IndexesAlloca, "reload_index"); +  } else { +    if (ElemWidth == 2) +      CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +    else +      CastIndex = Indexes; +  } +  LLVM_DEBUG(dbgs() << "  Cast index       : " << *CastIndex << ")\n"); + +  if (ElemWidth == 1) { +    // v128i8 There is no native instruction for this. +    // Do this as two Hi/Lo gathers with masking. +    Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +    // Extend indexes. 
We assume that indexes are in 128i8 format - need to +    // expand them to Hi/Lo 64i16 +    Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32"); +    auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); +    auto *UnpackedIndexes = Builder.CreateIntrinsic( +        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr); +    LLVM_DEBUG(dbgs() << "  UnpackedIndexes  : " << *UnpackedIndexes << ")\n"); + +    auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); +    auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); +    [[maybe_unused]] Value *IndexHi = +        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); +    [[maybe_unused]] Value *IndexLo = +        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); +    LLVM_DEBUG(dbgs() << "  UnpackedIndHi    : " << *IndexHi << ")\n"); +    LLVM_DEBUG(dbgs() << "  UnpackedIndLo    : " << *IndexLo << ")\n"); +    // Now unpack values to scatter +    Value *CastSrc = +        getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter); +    LLVM_DEBUG(dbgs() << "  CastSrc          : " << *CastSrc << ")\n"); +    auto *UnpackedValueToScatter = Builder.CreateIntrinsic( +        HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr); +    LLVM_DEBUG(dbgs() << "  UnpackedValToScat: " << *UnpackedValueToScatter +                      << ")\n"); + +    [[maybe_unused]] Value *UVSHi = +        HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter); +    [[maybe_unused]] Value *UVSLo = +        HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter); +    LLVM_DEBUG(dbgs() << "  UVSHi            : " << *UVSHi << ")\n"); +    LLVM_DEBUG(dbgs() << "  UVSLo            : " << *UVSLo << ")\n"); + +    // Create the mask for individual bytes +    auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); +    LLVM_DEBUG(dbgs() << "  QByteMask        : " << *QByteMask << "\n"); +    [[maybe_unused]] auto *ResHi = 
Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, +        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         IndexHi, UVSHi}, +        nullptr); +    LLVM_DEBUG(dbgs() << "  ResHi            : " << *ResHi << ")\n"); +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B, +        {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         IndexLo, UVSLo}, +        nullptr); +  } else if (ElemWidth == 2) { +    Value *CastSrc = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter); +    LLVM_DEBUG(dbgs() << "  CastSrc        : " << *CastSrc << ")\n"); +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B, +        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, +         CastSrc}, +        nullptr); +  } else if (ElemWidth == 4) { +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B, +        {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex, +         ValueToScatter}, +        nullptr); +  } else { +    LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n"); +    return nullptr; +  } +} + +Value *HvxIdioms::processVGather(Instruction &In) const { +  [[maybe_unused]] auto *InpTy = +      dyn_cast<VectorType>(In.getOperand(0)->getType()); +  assert(InpTy && "Cannot handle no vector type for llvm.gather"); +  [[maybe_unused]] auto *ElemTy = +      dyn_cast<PointerType>(InpTy->getElementType()); +  assert(ElemTy && "llvm.gather needs vector of ptr argument"); +  auto *F = In.getFunction(); +  LLVMContext &Ctx = F->getContext(); +  LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n" +                    << *In.getParent() << "\n"); +  LLVM_DEBUG(dbgs() << "  Input type(" << *InpTy << ") elements(" +                    << HVC.length(InpTy) << ") VecLen(" 
<< HVC.getSizeOf(InpTy) +                    << ") type(" << *ElemTy << ") Access alignment(" +                    << *In.getOperand(1) << ") AddressSpace(" +                    << ElemTy->getAddressSpace() << ")\n"); + +  // TODO: Handle masking of elements. +  assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) && +         "llvm.gather needs vector for mask"); +  IRBuilder Builder(In.getParent(), In.getIterator(), +                    InstSimplifyFolder(HVC.DL)); + +  // See who is using the result. The difference between LLVM and HVX vgather +  // Intrinsic makes it impossible to handle all cases with temp storage. Alloca +  // in VTCM is not yet supported, so for now we just bail out for those cases. +  HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined; +  Instruction *Dst = locateDestination(&In, Qual); +  if (!Dst) { +    LLVM_DEBUG(dbgs() << "  Unable to locate vgather destination\n"); +    return nullptr; +  } +  LLVM_DEBUG(dbgs() << "  Destination    : " << *Dst << " Qual(" << Qual +                    << ")\n"); + +  // Address of destination. Must be in VTCM. +  auto *Ptr = getPointer(Dst); +  if (!Ptr) { +    LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n"); +    return nullptr; +  } + +  // Result type. Assume it is a vector type. 
+  auto *DstType = cast<VectorType>(getIndexType(Dst)); +  assert(DstType && "Cannot handle non vector dst type for llvm.gather"); + +  // Base address for sources to be loaded +  auto *IndexLoad = locateAddressFromIntrinsic(&In); +  if (!IndexLoad) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  IndexLoad      : " << *IndexLoad << "\n"); + +  // Gather indexes/offsets +  auto *Indexes = locateIndexesFromIntrinsic(&In); +  if (!Indexes) +    return nullptr; +  LLVM_DEBUG(dbgs() << "  Indexes        : " << *Indexes << "\n"); + +  Instruction *Gather = nullptr; +  Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false); +  if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) { +    // We fully assume the address space is in VTCM. We also assume that all +    // pointers in Operand(0) have the same base(!). +    // This is the most basic case of all the above. +    unsigned OutputSize = HVC.getSizeOf(DstType); +    auto *DstElemTy = cast<IntegerType>(DstType->getElementType()); +    unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy); +    LLVM_DEBUG(dbgs() << "  Buffer type    : " << *Ptr->getType() +                      << "  Address space (" +                      << Ptr->getType()->getPointerAddressSpace() << ")\n" +                      << "  Result type    : " << *DstType +                      << "\n  Size in bytes  : " << OutputSize +                      << " element type(" << *DstElemTy +                      << ")\n  ElemWidth      : " << ElemWidth << " bytes\n"); + +    auto *IndexType = cast<VectorType>(getIndexType(Indexes)); +    assert(IndexType && "Cannot handle non vector index type for llvm.gather"); +    unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType()); +    LLVM_DEBUG(dbgs() << "  IndexWidth(" << IndexWidth << ")\n"); + +    // Intrinsic takes i32 instead of pointer so cast. 
+    Value *CastedPtr = Builder.CreateBitOrPointerCast( +        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...] +    // int_hexagon_V6_vgathermh       [... , llvm_v16i32_ty] +    // int_hexagon_V6_vgathermh_128B  [... , llvm_v32i32_ty] +    // int_hexagon_V6_vgathermhw      [... , llvm_v32i32_ty] +    // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty] +    // int_hexagon_V6_vgathermw       [... , llvm_v16i32_ty] +    // int_hexagon_V6_vgathermw_128B  [... , llvm_v32i32_ty] +    if (HVC.HST.getVectorLength() == OutputSize) { +      if (ElemWidth == 1) { +        // v128i8 There is no native instruction for this. +        // Do this as two Hi/Lo gathers with masking. +        // Unpack indexes. We assume that indexes are in 128i8 format - need to +        // expand them to Hi/Lo 64i16 +        Value *CastIndexes = +            Builder.CreateBitCast(Indexes, NT, "cast_to_32i32"); +        auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub); +        auto *UnpackedIndexes = +            Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true), +                                    V6_vunpack, CastIndexes, nullptr); +        LLVM_DEBUG(dbgs() << "  UnpackedIndexes : " << *UnpackedIndexes +                          << ")\n"); + +        auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi); +        auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo); +        [[maybe_unused]] Value *IndexHi = +            HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes); +        [[maybe_unused]] Value *IndexLo = +            HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes); +        LLVM_DEBUG(dbgs() << "  UnpackedIndHi   : " << *IndexHi << ")\n"); +        LLVM_DEBUG(dbgs() << "  UnpackedIndLo   : " << *IndexLo << ")\n"); +        // Create the mask for individual bytes +        auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff); +        LLVM_DEBUG(dbgs() << "  QByteMask     
  : " << *QByteMask << "\n"); +        // We use our destination allocation as a temp storage +        // This is unlikely to work properly for masked gather. +        auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq); +        [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), V6_vgather, +            {Ptr, QByteMask, CastedPtr, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi}, +            nullptr); +        LLVM_DEBUG(dbgs() << "  GatherHi        : " << *GatherHi << ")\n"); +        // Rematerialize the result +        [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi"); +        LLVM_DEBUG(dbgs() << "  LoadedResultHi : " << *LoadedResultHi << "\n"); +        // Same for the low part. Here we use Gather to return non-NULL result +        // from this function and continue to iterate. We also are deleting Dst +        // store below. +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), V6_vgather, +            {Ptr, QByteMask, CastedPtr, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo}, +            nullptr); +        LLVM_DEBUG(dbgs() << "  GatherLo        : " << *Gather << ")\n"); +        Value *LoadedResultLo = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo"); +        LLVM_DEBUG(dbgs() << "  LoadedResultLo : " << *LoadedResultLo << "\n"); +        // Now we have properly sized bytes in every other position +        // B b A a c a A b B c f F g G h H is presented as +        // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . 
H +        // Use vpack to gather them +        auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb); +        [[maybe_unused]] auto Res = Builder.CreateIntrinsic( +            NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr); +        LLVM_DEBUG(dbgs() << "  ScaledRes      : " << *Res << "\n"); +        [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr); +        LLVM_DEBUG(dbgs() << "  StoreRes       : " << *StoreRes << "\n"); +      } else if (ElemWidth == 2) { +        // v32i16 +        if (IndexWidth == 2) { +          // Reinterprete 64i16 as 32i32. Only needed for syntactic IR match. +          Value *CastIndex = +              getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +          LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n"); +          // shift all i16 left by 1 to match short addressing mode instead of +          // byte. +          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +          Value *AdjustedIndex = HVC.createHvxIntrinsic( +              Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); +          LLVM_DEBUG(dbgs() +                     << "  Shifted half index: " << *AdjustedIndex << ")\n"); + +          auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh); +          // The 3rd argument is the size of the region to gather from. Probably +          // want to set it to max VTCM size. 
+          Gather = Builder.CreateIntrinsic( +              Type::getVoidTy(Ctx), V6_vgather, +              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +               AdjustedIndex}, +              nullptr); +          for (auto &U : Dst->uses()) { +            if (auto *UI = dyn_cast<Instruction>(U.getUser())) +              dbgs() << "    dst used by: " << *UI << "\n"; +          } +          for (auto &U : In.uses()) { +            if (auto *UI = dyn_cast<Instruction>(U.getUser())) +              dbgs() << "    In used by : " << *UI << "\n"; +          } +          // Create temp load from result in case the result is used by any +          // other instruction. +          Value *LoadedResult = Builder.CreateLoad( +              HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result"); +          LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +          In.replaceAllUsesWith(LoadedResult); +        } else { +          dbgs() << "    Unhandled index type for vgather\n"; +          return nullptr; +        } +      } else if (ElemWidth == 4) { +        if (IndexWidth == 4) { +          // v32i32 +          auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +          Value *AdjustedIndex = HVC.createHvxIntrinsic( +              Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)}); +          LLVM_DEBUG(dbgs() +                     << "  Shifted word index: " << *AdjustedIndex << ")\n"); +          Gather = Builder.CreateIntrinsic( +              Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B, +              {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +               AdjustedIndex}, +              nullptr); +        } else { +          LLVM_DEBUG(dbgs() << "    Unhandled index type for vgather\n"); +          return nullptr; +        } +      } else { +        LLVM_DEBUG(dbgs() << "    Unhandled element type for vgather\n"); +        return nullptr; +      } +    } else if 
(HVC.HST.getVectorLength() == OutputSize * 2) { +      // This is half of the reg width, duplicate low in high +      LLVM_DEBUG(dbgs() << "    Unhandled half of register size\n"); +      return nullptr; +    } else if (HVC.HST.getVectorLength() * 2 == OutputSize) { +      LLVM_DEBUG(dbgs() << "    Unhandle twice the register size\n"); +      return nullptr; +    } +    // Erase the original intrinsic and store that consumes it. +    // HVX will create a pseudo for gather that is expanded to gather + store +    // during packetization. +    Dst->eraseFromParent(); +  } else if (Qual == HvxIdioms::LLVM_Scatter) { +    // Gather feeds directly into scatter. +    LLVM_DEBUG({ +      auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType()); +      assert(DstInpTy && "Cannot handle no vector type for llvm.scatter"); +      unsigned DstInpSize = HVC.getSizeOf(DstInpTy); +      unsigned DstElements = HVC.length(DstInpTy); +      auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType()); +      assert(DstElemTy && "llvm.scatter needs vector of ptr argument"); +      dbgs() << "  Gather feeds into scatter\n  Values to scatter : " +             << *Dst->getOperand(0) << "\n"; +      dbgs() << "  Dst type(" << *DstInpTy << ") elements(" << DstElements +             << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy +             << ") Access alignment(" << *Dst->getOperand(2) << ")\n"; +    }); +    // Address of source +    auto *Src = getPointer(IndexLoad); +    if (!Src) +      return nullptr; +    LLVM_DEBUG(dbgs() << "  Src            : " << *Src << "\n"); + +    if (!isa<PointerType>(Src->getType())) { +      LLVM_DEBUG(dbgs() << "    Source is not a pointer type...\n"); +      return nullptr; +    } + +    Value *CastedSrc = Builder.CreateBitOrPointerCast( +        Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n"); + +    auto *DstLoad = locateAddressFromIntrinsic(Dst); +    if (!DstLoad) 
{ +      LLVM_DEBUG(dbgs() << "  Unable to locate DstLoad\n"); +      return nullptr; +    } +    LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n"); + +    Value *Ptr = getPointer(DstLoad); +    if (!Ptr) +      return nullptr; +    LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr << "\n"); +    Value *CastIndex = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad); +    LLVM_DEBUG(dbgs() << "  Cast index: " << *CastIndex << ")\n"); +    // Shift all i16 left by 1 to match short addressing mode instead of +    // byte. +    auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh); +    Value *AdjustedIndex = HVC.createHvxIntrinsic( +        Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)}); +    LLVM_DEBUG(dbgs() << "  Shifted half index: " << *AdjustedIndex << ")\n"); + +    return Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +        {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         AdjustedIndex}, +        nullptr); +  } else if (Qual == HvxIdioms::HEX_Gather_Scatter) { +    // Gather feeds into previously inserted pseudo intrinsic. +    // These could not be in the same packet, so we need to generate another +    // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo +    // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt, +    // ModRegs:$Mu, HvxVR:$Vv) +    if (isa<AllocaInst>(IndexLoad)) { +      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +      if (cstDataVector) { +        // Our indexes are represented as a constant. We need THEM in a reg. +        // This most likely will not work properly since alloca gives us DDR +        // stack location. This will be fixed once we teach compiler about VTCM. 
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); +        [[maybe_unused]] auto *StoreIndexes = +            Builder.CreateStore(cstDataVector, IndexesAlloca); +        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n"); +        Value *LoadedIndex = Builder.CreateLoad( +            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); +        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca << "\n"); + +        Value *CastedSrc = Builder.CreateBitOrPointerCast( +            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); + +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +            {ResultAlloca, CastedSrc, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, +            nullptr); +        Value *LoadedResult = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n"); +        In.replaceAllUsesWith(LoadedResult); +      } +    } else { +      // Address of source +      auto *Src = getPointer(IndexLoad); +      if (!Src) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  Src      : " << *Src << "\n"); + +      Value *CastedSrc = Builder.CreateBitOrPointerCast( +          Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +      LLVM_DEBUG(dbgs() << "  CastedSrc: " << *CastedSrc << "\n"); + +      auto *DstLoad = locateAddressFromIntrinsic(Dst); +      if (!DstLoad) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  DstLoad  : " << *DstLoad << "\n"); +      auto *Ptr = getPointer(DstLoad); +      if (!Ptr) +        return nullptr; +      LLVM_DEBUG(dbgs() << "  Ptr      : " << *Ptr 
<< "\n"); + +      Gather = Builder.CreateIntrinsic( +          Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh, +          {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +           Indexes}, +          nullptr); +    } +    return Gather; +  } else if (Qual == HvxIdioms::HEX_Scatter) { +    // This is the case when result of a gather is used as an argument to +    // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it +    // ourselves. We have to create alloca, store to it, and replace all uses +    // with that. +    AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +    Value *CastedSrc = Builder.CreateBitOrPointerCast( +        IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +    LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); +    Value *CastIndex = +        getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes); +    LLVM_DEBUG(dbgs() << "  Cast index     : " << *CastIndex << ")\n"); + +    Gather = Builder.CreateIntrinsic( +        Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +        {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), +         CastIndex}, +        nullptr); +    Value *LoadedResult = Builder.CreateLoad( +        HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +    LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +    In.replaceAllUsesWith(LoadedResult); +  } else if (Qual == HvxIdioms::HEX_Gather) { +    // Gather feeds to another gather but already replaced with +    // hexagon_V6_vgathermh_128B +    if (isa<AllocaInst>(IndexLoad)) { +      auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes); +      if (cstDataVector) { +        // Our indexes are represented as a constant. We need it in a reg. 
+        AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT); + +        [[maybe_unused]] auto *StoreIndexes = +            Builder.CreateStore(cstDataVector, IndexesAlloca); +        LLVM_DEBUG(dbgs() << "  StoreIndexes   : " << *StoreIndexes << "\n"); +        Value *LoadedIndex = Builder.CreateLoad( +            IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index"); +        AllocaInst *ResultAlloca = Builder.CreateAlloca(NT); +        LLVM_DEBUG(dbgs() << "  ResultAlloca   : " << *ResultAlloca +                          << "\n  AddressSpace: " +                          << ResultAlloca->getAddressSpace() << "\n";); + +        Value *CastedSrc = Builder.CreateBitOrPointerCast( +            IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32"); +        LLVM_DEBUG(dbgs() << "  CastedSrc      : " << *CastedSrc << "\n"); + +        Gather = Builder.CreateIntrinsic( +            Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B, +            {ResultAlloca, CastedSrc, +             HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex}, +            nullptr); +        Value *LoadedResult = Builder.CreateLoad( +            HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result"); +        LLVM_DEBUG(dbgs() << "  LoadedResult   : " << *LoadedResult << "\n"); +        LLVM_DEBUG(dbgs() << "  Gather         : " << *Gather << "\n"); +        In.replaceAllUsesWith(LoadedResult); +      } +    } +  } else if (Qual == HvxIdioms::LLVM_Gather) { +    // Gather feeds into another gather +    errs() << " Underimplemented vgather to vgather sequence\n"; +    return nullptr; +  } else +    llvm_unreachable("Unhandled Qual enum"); + +  return Gather; +} +  auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,                                       const FxpOp &Op) const -> Value * {    assert(Op.X.Val->getType() == Op.Y.Val->getType()); @@ -2138,6 +3000,26 @@ auto HvxIdioms::run() -> bool {          It = StartOver ? 
B.rbegin()                         : cast<Instruction>(New)->getReverseIterator();          Changed = true; +      } else if (matchGather(*It)) { +        Value *New = processVGather(*It); +        if (!New) +          continue; +        LLVM_DEBUG(dbgs() << "  Gather : " << *New << "\n"); +        // We replace original intrinsic with a new pseudo call. +        It->eraseFromParent(); +        It = cast<Instruction>(New)->getReverseIterator(); +        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); +        Changed = true; +      } else if (matchScatter(*It)) { +        Value *New = processVScatter(*It); +        if (!New) +          continue; +        LLVM_DEBUG(dbgs() << "  Scatter : " << *New << "\n"); +        // We replace original intrinsic with a new pseudo call. +        It->eraseFromParent(); +        It = cast<Instruction>(New)->getReverseIterator(); +        RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI); +        Changed = true;        }      }    } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 6455757..2f59b7c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -186,6 +186,9 @@ static unsigned featureToArchVersion(unsigned Feature) {    case Hexagon::ArchV79:    case Hexagon::ExtensionHVXV79:      return 79; +  case Hexagon::ArchV81: +  case Hexagon::ExtensionHVXV81: +    return 81;    }    llvm_unreachable("Expected valid arch feature");    return 0; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 6b48a21..b8075bd 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -96,6 +96,8 @@ cl::opt<bool> MV75("mv75", cl::Hidden, cl::desc("Build for Hexagon V75"),  
                   cl::init(false));  cl::opt<bool> MV79("mv79", cl::Hidden, cl::desc("Build for Hexagon V79"),                     cl::init(false)); +cl::opt<bool> MV81("mv81", cl::Hidden, cl::desc("Build for Hexagon V81"), +                   cl::init(false));  } // namespace  static cl::opt<Hexagon::ArchEnum> EnableHVX( @@ -111,6 +113,7 @@ static cl::opt<Hexagon::ArchEnum> EnableHVX(                 clEnumValN(Hexagon::ArchEnum::V73, "v73", "Build for HVX v73"),                 clEnumValN(Hexagon::ArchEnum::V75, "v75", "Build for HVX v75"),                 clEnumValN(Hexagon::ArchEnum::V79, "v79", "Build for HVX v79"), +               clEnumValN(Hexagon::ArchEnum::V81, "v81", "Build for HVX v81"),                 // Sentinel for no value specified.                 clEnumValN(Hexagon::ArchEnum::Generic, "", "")),      // Sentinel for flag not present. @@ -159,6 +162,8 @@ static StringRef HexagonGetArchVariant() {      return "hexagonv75";    if (MV79)      return "hexagonv79"; +  if (MV81) +    return "hexagonv81";    return "";  } @@ -474,6 +479,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {    case Hexagon::ArchEnum::V79:      Result.push_back("+hvxv79");      break; +  case Hexagon::ArchEnum::V81: +    Result.push_back("+hvxv81"); +    break;    case Hexagon::ArchEnum::Generic: {      Result.push_back(StringSwitch<StringRef>(CPU) @@ -489,7 +497,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {                           .Case("hexagonv71t", "+hvxv71")                           .Case("hexagonv73", "+hvxv73")                           .Case("hexagonv75", "+hvxv75") -                         .Case("hexagonv79", "+hvxv79")); +                         .Case("hexagonv79", "+hvxv79") +                         .Case("hexagonv81", "+hvxv81"));      break;    }    case Hexagon::ArchEnum::NoArch: @@ -538,8 +547,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    FeatureBitset FB = S;    unsigned CpuArch = ArchV5;  
  for (unsigned F : -       {ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, ArchV66, -        ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) { +       {ArchV81, ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, +        ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {      if (!FB.test(F))        continue;      CpuArch = F; @@ -556,7 +565,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    for (unsigned F :         {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66,          ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71, -        ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79}) { +        ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79, ExtensionHVXV81}) {      if (!FB.test(F))        continue;      HasHvxVer = true; @@ -569,6 +578,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {    // HasHvxVer is false, and UseHvx is true.    switch (CpuArch) { +  case ArchV81: +    FB.set(ExtensionHVXV81); +    [[fallthrough]];    case ArchV79:      FB.set(ExtensionHVXV79);      [[fallthrough]]; @@ -668,12 +680,12 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {  std::optional<unsigned>  Hexagon_MC::getHVXVersion(const FeatureBitset &Features) { -  for (auto Arch : {Hexagon::ExtensionHVXV79, Hexagon::ExtensionHVXV75, -                    Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71, -                    Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68, -                    Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66, -                    Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62, -                    Hexagon::ExtensionHVXV60}) +  for (auto Arch : {Hexagon::ExtensionHVXV81, Hexagon::ExtensionHVXV79, +                    Hexagon::ExtensionHVXV75, Hexagon::ExtensionHVXV73, +                    Hexagon::ExtensionHVXV71, Hexagon::ExtensionHVXV69, +                    Hexagon::ExtensionHVXV68, 
Hexagon::ExtensionHVXV67, +                    Hexagon::ExtensionHVXV66, Hexagon::ExtensionHVXV65, +                    Hexagon::ExtensionHVXV62, Hexagon::ExtensionHVXV60})      if (Features.test(Arch))        return Arch;    return {}; @@ -681,13 +693,13 @@ Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {  unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {    for (auto Arch : -       {Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, Hexagon::ArchV71, -        Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, Hexagon::ArchV66, -        Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, Hexagon::ArchV55, -        Hexagon::ArchV5}) +       {Hexagon::ArchV81, Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, +        Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, +        Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, +        Hexagon::ArchV55, Hexagon::ArchV5})      if (Features.test(Arch))        return Arch; -  llvm_unreachable("Expected arch v5-v79"); +  llvm_unreachable("Expected arch v5-v81");    return 0;  } @@ -708,7 +720,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {        .Case("hexagonv71t", llvm::ELF::EF_HEXAGON_MACH_V71T)        .Case("hexagonv73", llvm::ELF::EF_HEXAGON_MACH_V73)        .Case("hexagonv75", llvm::ELF::EF_HEXAGON_MACH_V75) -      .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79); +      .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79) +      .Case("hexagonv81", llvm::ELF::EF_HEXAGON_MACH_V81);  }  llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() { diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index aca7abd..44d1a44 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4578,6 +4578,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;  def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;  def : 
InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>; +def : InstAlias<"mtpidr $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsISA3_0]>; +def : InstAlias<"mfpidr $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsISA3_0]>;  foreach SPRG = 4-7 in {    def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 9e6b7f0..2754d78 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1124,7 +1124,8 @@ def HasStdExtZbkbOrP                           "'Base P' (Packed-SIMD)">;  def HasStdExtZbbOrZbkbOrP -    : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">, +    : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || " +                "Subtarget->hasStdExtP()">,        AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP),                           "'Zbb' (Basic Bit-Manipulation) or "                           "'Zbkb' (Bitmanip instructions for Cryptography) or " diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 219e3f2..1c930ac 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -318,8 +318,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); -  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() && -      !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() && +  if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() && +      !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() && +      !Subtarget.hasVendorXAndesPerf() &&        !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))      setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand); @@ -392,7 +393,7 @@ 
RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,        setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);    } -  if (Subtarget.hasStdExtZbb() || +  if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() ||        (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {      setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,                         Legal); @@ -403,6 +404,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,        setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);    } else {      setOperationAction(ISD::CTTZ, XLenVT, Expand); +    // If have a CLZW, but not CTZW, custom promote i32. +    if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) +      setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);    }    if (!Subtarget.hasCPOPLike()) { @@ -419,13 +423,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,      // We need the custom lowering to make sure that the resulting sequence      // for the 32bit case is efficient on 64bit targets.      // Use default promotion for i32 without Zbb. -    if (Subtarget.is64Bit() && Subtarget.hasStdExtZbb()) +    if (Subtarget.is64Bit() && +        (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP()))        setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);    } else {      setOperationAction(ISD::CTLZ, XLenVT, Expand);    } -  if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) { +  if (Subtarget.hasStdExtP() || +      (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {      setOperationAction(ISD::ABS, XLenVT, Legal);    } else if (Subtarget.hasShortForwardBranchOpt()) {      // We can use PseudoCCSUB to implement ABS. 
@@ -14669,6 +14675,25 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));      bool IsCTZ =          N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF; + +    // Without Zbb, lower as 32 - clzw(~X & (X-1)) +    if (IsCTZ && !Subtarget.hasStdExtZbb()) { +      assert(Subtarget.hasStdExtP()); + +      NewOp0 = DAG.getFreeze(NewOp0); +      SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64); +      SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0, +                                   DAG.getConstant(1, DL, MVT::i64)); +      SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1); +      SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And); +      SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64, +                                DAG.getConstant(32, DL, MVT::i64), CLZW); +      SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub, +                                DAG.getValueType(MVT::i32)); +      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); +      return; +    } +      unsigned Opc = IsCTZ ? 
RISCVISD::CTZW : RISCVISD::CLZW;      SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0);      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7d8a919..cc085bb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1455,3 +1455,11 @@ let Predicates = [HasStdExtP, IsRV32] in {    def PMAXU_DW     : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">;    def PMAXU_DB     : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">;  } // Predicates = [HasStdExtP, IsRV32] + + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtP] in +def : PatGpr<abs, ABS>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 4c2f7f6..f7b4914 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -218,11 +218,13 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0,  }  let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in { -  def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">; +  def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">, +                   SchedUnaryMC<"WriteSF_VFExp", "ReadSF_VFExp">;  }  let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in { -  def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">; +  def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">, +                    SchedUnaryMC<"WriteSF_VFExpa", "ReadSF_VFExpa">;  }  let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector", @@ -487,6 +489,48 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {    defm SF_VFNRCLIP_X_F_QF : 
VPseudoSiFiveVFNRCLIP;  } +class VFExpSchedSEWSet<string mx, bit IsBF16, bit IsApprox> { +  defvar BaseSet = SchedSEWSet<mx, isF=1>.val; +  list<int> val = !if(IsBF16, !listremove(BaseSet, [32, 64]), +                      !if(IsApprox, BaseSet, !listremove(BaseSet, [64]))); +} +multiclass VPseudoVFExp_V<bit IsBF16 = false, bit IsApprox = false> { +  defvar SchedSuffix = !if(IsApprox, "VFExpa", "VFExp"); + +  foreach m = MxListF in { +    defvar mx = m.MX; +    foreach e = VFExpSchedSEWSet<mx, IsBF16, IsApprox>.val in { +      let VLMul = m.value in { +        def "_V_" # mx # "_E" # e +            : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, +              SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, +                         mx, e, forcePassthruRead=true>; +        def "_V_" # mx # "_E" # e # "_MASK" +            : VPseudoUnaryMask<m.vrclass, m.vrclass>, +              RISCVMaskedPseudo<MaskIdx = 2>, +              SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix, +                         mx, e, forcePassthruRead=true>; +      } +    } +  } +} + +let Predicates = [HasVendorXSfvfbfexp16e], hasSideEffects = 0 in { +  let AltFmtType = IS_ALTFMT in { +    defm PseudoSF_VFEXP_ALT : VPseudoVFExp_V<IsBF16=true>; +  } +} + +let Predicates = [HasVendorXSfvfexpAnyFloat], hasSideEffects = 0 in { +  let AltFmtType = IS_NOT_ALTFMT in { +    defm PseudoSF_VFEXP : VPseudoVFExp_V; +  } +} + +let Predicates = [HasVendorXSfvfexpa], AltFmtType = IS_NOT_ALTFMT in { +  defm PseudoSF_VFEXPA : VPseudoVFExp_V<IsApprox=true>; +} +  // SDNode  def SDT_SF_VC_V_X : SDTypeProfile<1, 4, [SDTCisVec<0>,                                           SDTCisVT<1, XLenVT>, @@ -893,3 +937,36 @@ let Predicates = [HasVendorXSfcease] in {      let rs2 = 0b00101;  }  } + +let Predicates = [HasVendorXSfvfbfexp16e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP_ALT", +                      AllBF16Vectors, +                      isSEWAware=1>; +} + +let 
Predicates = [HasVendorXSfvfexp16e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", +                      [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], +                      isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexp32e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP", +                      [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa, HasVInstructionsF16] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8], +                      isSEWAware=1>; +} + +let Predicates = [HasVendorXSfvfexpa64e] in { +  defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA", +                      [VF64M1, VF64M2, VF64M4, VF64M8], isSEWAware=1>; +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 6b9a75f..5429c2a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -599,14 +599,20 @@ def : PatGpr<riscv_zip, ZIP_RV32, i32>;  def : PatGpr<riscv_unzip, UNZIP_RV32, i32>;  } // Predicates = [HasStdExtZbkb, IsRV32] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : PatGpr<ctlz, CLZ>; +} + +let Predicates = [HasStdExtZbb] in {  def : PatGpr<cttz, CTZ>;  def : PatGpr<ctpop, CPOP>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb, IsRV64] in { +let Predicates = [HasStdExtZbbOrP, IsRV64] in {  def : PatGpr<riscv_clzw, CLZW>; +} + +let Predicates = [HasStdExtZbb, IsRV64] in {  def : PatGpr<riscv_ctzw, CTZW>;  def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>; @@ -614,22 +620,22 @@ def : Pat<(i64 (riscv_negw_max 
GPR:$rs1)),            (MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;  } // Predicates = [HasStdExtZbb, IsRV64] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>;  def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbb] in { +let Predicates = [HasStdExtZbbOrP] in {  def : PatGprGpr<smin, MIN>;  def : PatGprGpr<smax, MAX>;  def : PatGprGpr<umin, MINU>;  def : PatGprGpr<umax, MAXU>;  } // Predicates = [HasStdExtZbb] -let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in  def : PatGpr<bswap, REV8_RV32, i32>; -let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in +let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in  def : PatGpr<bswap, REV8_RV64, i64>;  let Predicates = [HasStdExtZbkb] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 637d61fe..36a2f46 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -1588,6 +1588,10 @@ multiclass SiFive7SchedResources<int vlen, bit dualVALU,    //===----------------------------------------------------------------------===//    // Unsupported extensions    defm : UnsupportedSchedQ; +  // TODO: scheduling info of XSfvfexp* and XSfvfexpa* +  // for SiFive7 will be added in follow-up patches. 
+  defm : UnsupportedSchedXSfvfexp; +  defm : UnsupportedSchedXSfvfexpa;    defm : UnsupportedSchedZabha;    defm : UnsupportedSchedZbc;    defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 9ab9636..64ccfd8 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -523,6 +523,8 @@ include "RISCVScheduleZvk.td"  // Vendor Extensions  multiclass UnsupportedSchedXsf {    defm : UnsupportedSchedXsfvcp; +  defm : UnsupportedSchedXSfvfexp; +  defm : UnsupportedSchedXSfvfexpa;    defm : UnsupportedSchedXSfvfnrclipxfqf;    defm : UnsupportedSchedXSfvfwmaccqqq;    defm : UnsupportedSchedXSfvqmaccdod; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td index 99632e4..1ee6dc1 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -99,3 +99,23 @@ defm : LMULWriteRes<"WriteSF_VFWMACC_QQQ", []>;  defm : LMULReadAdvance<"ReadSF_VFWMACC_QQQ", 0>;  } // Unsupported = true  } + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExp">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExp">; + +multiclass UnsupportedSchedXSfvfexp { +let Unsupported = true in { +defm : LMULSEWWriteResF<"WriteSF_VFExp", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExp", 0>; +} // Unsupported = true +} + +defm "" : LMULSEWSchedWritesF<"WriteSF_VFExpa">; +defm "" : LMULSEWSchedReadsF<"ReadSF_VFExpa">; + +multiclass UnsupportedSchedXSfvfexpa { +let Unsupported = true in { +defm : LMULSEWWriteResF<"WriteSF_VFExpa", []>; +defm : LMULSEWReadAdvanceF<"ReadSF_VFExpa", 0>; +} // Unsupported = true +} diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 334db4b..4b4fc8f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -187,7 +187,7 @@ public:    }    bool hasCLZLike() const { -    return HasStdExtZbb || HasVendorXTHeadBb || 
+    return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb ||             (HasVendorXCVbitmanip && !IsRV64);    }    bool hasCTZLike() const { @@ -197,7 +197,7 @@ public:      return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64);    }    bool hasREV8Like() const { -    return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb; +    return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb;    }    bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 62073ec..4393f6e 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4721,9 +4721,6 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {    if (!(Subtarget->hasVLX() || NVT.is512BitVector()))      return false; -  SDValue N0 = N->getOperand(0); -  SDValue N1 = N->getOperand(1); -    auto getFoldableLogicOp = [](SDValue Op) {      // Peek through single use bitcast.      
if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) @@ -4740,13 +4737,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {      return SDValue();    }; -  SDValue A, FoldableOp; -  if ((FoldableOp = getFoldableLogicOp(N1))) { -    A = N0; -  } else if ((FoldableOp = getFoldableLogicOp(N0))) { -    A = N1; -  } else -    return false; +  SDValue N0, N1, A, FoldableOp; + +  // Identify and (optionally) peel an outer NOT that wraps a pure logic tree +  auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) { +    if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() && +        ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) { +      SDValue InnerOp = Op->getOperand(0); + +      if (!getFoldableLogicOp(InnerOp)) +        return SDValue(); + +      N0 = InnerOp.getOperand(0); +      N1 = InnerOp.getOperand(1); +      if ((FoldableOp = getFoldableLogicOp(N1))) { +        A = N0; +        return InnerOp; +      } +      if ((FoldableOp = getFoldableLogicOp(N0))) { +        A = N1; +        return InnerOp; +      } +    } +    return SDValue(); +  }; + +  bool PeeledOuterNot = false; +  SDNode *OriN = N; +  if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) { +    PeeledOuterNot = true; +    N = InnerOp.getNode(); +  } else { +    N0 = N->getOperand(0); +    N1 = N->getOperand(1); + +    if ((FoldableOp = getFoldableLogicOp(N1))) +      A = N0; +    else if ((FoldableOp = getFoldableLogicOp(N0))) +      A = N1; +    else +      return false; +  }    SDValue B = FoldableOp.getOperand(0);    SDValue C = FoldableOp.getOperand(1); @@ -4798,7 +4829,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {    case ISD::XOR: Imm ^= TernlogMagicA; break;    } -  return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm); +  if (PeeledOuterNot) +    Imm = ~Imm; + +  return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);  }  /// If the high bits of an 'and' operand are known zero, try setting the diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4dfc400..410f20e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57617,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,    }    // Fold any similar generic ADD/SUB opcodes to reuse this node. -  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { +  auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {      SDValue Ops[] = {N0, N1};      SDVTList VTs = DAG.getVTList(N->getValueType(0)); -    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { +    if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {        SDValue Op(N, 0);        if (Negate) {          // Bail if this is only used by a user of the x86 add/sub. @@ -57632,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,        DCI.CombineTo(GenericAddSub, Op);      }    }; -  MatchGeneric(LHS, RHS, false); -  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); +  MatchGeneric(GenericOpc, LHS, RHS, false); +  MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode()); + +  if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) { +    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); +    if (X86ISD::SUB == N->getOpcode()) { +      // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C). +      MatchGeneric(ISD::ADD, LHS, NegC, false); +    } else { +      // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS). +      MatchGeneric(ISD::SUB, NegC, LHS, true); +    } +  } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) { +    if (X86ISD::SUB == N->getOpcode()) { +      SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT); +      // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C). +      MatchGeneric(ISD::ADD, RHS, NegC, true); +    } +  }    // TODO: Can we drop the ZeroSecondOpOnly limit? 
This is to guarantee that the    // EFLAGS result doesn't change. diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c1..b7151f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1592,7 +1592,6 @@ namespace llvm {      bool useLoadStackGuardNode(const Module &M) const override;      bool useStackGuardXorFP() const override;      void insertSSPDeclarations(Module &M) const override; -    Function *getSSPStackGuardCheck(const Module &M) const override;      SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,                                  const SDLoc &DL) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 37d7772..a61bbe5 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -640,15 +640,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {    TargetLowering::insertSSPDeclarations(M);  } -Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { -  // MSVC CRT has a function to validate security cookie. -  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || -      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { -    return M.getFunction("__security_check_cookie"); -  } -  return TargetLowering::getSSPStackGuardCheck(M); -} -  Value *  X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {    // Android provides a fixed TLS slot for the SafeStack pointer. 
See the diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index edcf247..632c6a2 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -1407,7 +1407,7 @@ let isBarrier = 1, isTerminator = 1 in {      let r = 0x04;    } -  def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm), +  def BREAK_N : RRRN_Inst<0x0D, (outs), (ins uimm4:$imm),                           "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> {      bits<4> imm; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 669d4f0..8d9933b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -582,6 +582,18 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {            IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1);        return BinaryOperator::CreateSub(ConstCtlz, X);      } + +    // ctlz(~x & (x - 1)) -> bitwidth - cttz(x, false) +    if (Op0->hasOneUse() && +        match(Op0, +              m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { +      Type *Ty = II.getType(); +      unsigned BitWidth = Ty->getScalarSizeInBits(); +      auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty, +                                              {X, IC.Builder.getFalse()}); +      auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); +      return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); +    }    }    // cttz(Pow2) -> Log2(Pow2) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 5aa8de3..f5130da 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4697,5 +4697,31 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { 
                 cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),                  CondVal, FalseVal)); +  // Canonicalize sign function ashr pattern: select (icmp slt X, 1), ashr X, +  // bitwidth-1, 1 -> scmp(X, 0) +  // Also handles: select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) +  unsigned BitWidth = SI.getType()->getScalarSizeInBits(); +  CmpPredicate Pred; +  Value *CmpLHS, *CmpRHS; + +  // Canonicalize sign function ashr patterns: +  // select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0) +  // select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0) +  if (match(&SI, m_Select(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), +                          m_Value(TrueVal), m_Value(FalseVal))) && +      ((Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_One()) && +        match(TrueVal, +              m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1))) && +        match(FalseVal, m_One())) || +       (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_Zero()) && +        match(TrueVal, m_One()) && +        match(FalseVal, +              m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1)))))) { + +    Function *Scmp = Intrinsic::getOrInsertDeclaration( +        SI.getModule(), Intrinsic::scmp, {SI.getType(), SI.getType()}); +    return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)}); +  } +    return nullptr;  } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67e2aae..9c8de45 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2327,6 +2327,18 @@ Constant *InstCombinerImpl::unshuffleConstant(ArrayRef<int> ShMask, Constant *C,    return ConstantVector::get(NewVecC);  } +// Get the result of `Vector Op Splat` (or Splat Op Vector if \p SplatLHS). 
+static Constant *constantFoldBinOpWithSplat(unsigned Opcode, Constant *Vector, +                                            Constant *Splat, bool SplatLHS, +                                            const DataLayout &DL) { +  ElementCount EC = cast<VectorType>(Vector->getType())->getElementCount(); +  Constant *LHS = ConstantVector::getSplat(EC, Splat); +  Constant *RHS = Vector; +  if (!SplatLHS) +    std::swap(LHS, RHS); +  return ConstantFoldBinaryOpOperands(Opcode, LHS, RHS, DL); +} +  Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {    if (!isa<VectorType>(Inst.getType()))      return nullptr; @@ -2338,6 +2350,37 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {    assert(cast<VectorType>(RHS->getType())->getElementCount() ==           cast<VectorType>(Inst.getType())->getElementCount()); +  auto foldConstantsThroughSubVectorInsertSplat = +      [&](Value *MaybeSubVector, Value *MaybeSplat, +          bool SplatLHS) -> Instruction * { +    Value *Idx; +    Constant *Splat, *SubVector, *Dest; +    if (!match(MaybeSplat, m_ConstantSplat(m_Constant(Splat))) || +        !match(MaybeSubVector, +               m_VectorInsert(m_Constant(Dest), m_Constant(SubVector), +                              m_Value(Idx)))) +      return nullptr; +    SubVector = +        constantFoldBinOpWithSplat(Opcode, SubVector, Splat, SplatLHS, DL); +    Dest = constantFoldBinOpWithSplat(Opcode, Dest, Splat, SplatLHS, DL); +    if (!SubVector || !Dest) +      return nullptr; +    auto *InsertVector = +        Builder.CreateInsertVector(Dest->getType(), Dest, SubVector, Idx); +    return replaceInstUsesWith(Inst, InsertVector); +  }; + +  // If one operand is a constant splat and the other operand is a +  // `vector.insert` where both the destination and subvector are constant, +  // apply the operation to both the destination and subvector, returning a new +  // constant `vector.insert`. This helps constant folding for scalable vectors. 
+  if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( +          /*MaybeSubVector=*/LHS, /*MaybeSplat=*/RHS, /*SplatLHS=*/false)) +    return Folded; +  if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat( +          /*MaybeSubVector=*/RHS, /*MaybeSplat=*/LHS, /*SplatLHS=*/true)) +    return Folded; +    // If both operands of the binop are vector concatenations, then perform the    // narrow binop on each pair of the source operands followed by concatenation    // of the results. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b6cbecb..10b03bb 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -226,6 +226,7 @@ static const Align kMinOriginAlignment = Align(4);  static const Align kShadowTLSAlignment = Align(8);  // These constants must be kept in sync with the ones in msan.h. +// TODO: increase size to match SVE/SVE2/SME/SME2 limits  static const unsigned kParamTLSSize = 800;  static const unsigned kRetvalTLSSize = 800; @@ -1544,6 +1545,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {      }    } +  static bool isAArch64SVCount(Type *Ty) { +    if (TargetExtType *TTy = dyn_cast<TargetExtType>(Ty)) +      return TTy->getName() == "aarch64.svcount"; +    return false; +  } + +  // This is intended to match the "AArch64 Predicate-as-Counter Type" (aka +  // 'target("aarch64.svcount")', but not e.g., <vscale x 4 x i32>. +  static bool isScalableNonVectorType(Type *Ty) { +    if (!isAArch64SVCount(Ty)) +      LLVM_DEBUG(dbgs() << "isScalableNonVectorType: Unexpected type " << *Ty +                        << "\n"); + +    return Ty->isScalableTy() && !isa<VectorType>(Ty); +  } +    void materializeChecks() {  #ifndef NDEBUG      // For assert below. 
@@ -1672,6 +1689,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");        return Res;      } +    if (isScalableNonVectorType(OrigTy)) { +      LLVM_DEBUG(dbgs() << "getShadowTy: Scalable non-vector type: " << *OrigTy +                        << "\n"); +      return OrigTy; +    } +      uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);      return IntegerType::get(*MS.C, TypeSize);    } @@ -2185,8 +2208,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {                          << *OrigIns << "\n");        return;      } -#ifndef NDEBUG +      Type *ShadowTy = Shadow->getType(); +    if (isScalableNonVectorType(ShadowTy)) { +      LLVM_DEBUG(dbgs() << "Skipping check of scalable non-vector " << *Shadow +                        << " before " << *OrigIns << "\n"); +      return; +    } +#ifndef NDEBUG      assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) ||              isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) &&             "Can only insert checks for integer, vector, and aggregate shadow " @@ -6972,6 +7001,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {        // an extra "select". This results in much more compact IR.        // Sa = select Sb, poisoned, (select b, Sc, Sd)        Sa1 = getPoisonedShadow(getShadowTy(I.getType())); +    } else if (isScalableNonVectorType(I.getType())) { +      // This is intended to handle target("aarch64.svcount"), which can't be +      // handled in the else branch because of incompatibility with CreateXor +      // ("The supported LLVM operations on this type are limited to load, +      // store, phi, select and alloca instructions"). + +      // TODO: this currently underapproximates. Use Arm SVE EOR in the else +      //       branch as needed instead. 
+      Sa1 = getCleanShadow(getShadowTy(I.getType()));      } else {        // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]        // If Sb (condition is poisoned), look for bits in c and d that are equal diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a1ad2db..2591df8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4172,11 +4172,6 @@ class VPlan {    /// definitions are VPValues that hold a pointer to their underlying IR.    SmallVector<VPValue *, 16> VPLiveIns; -  /// Mapping from SCEVs to the VPValues representing their expansions. -  /// NOTE: This mapping is temporary and will be removed once all users have -  /// been modeled in VPlan directly. -  DenseMap<const SCEV *, VPValue *> SCEVToExpansion; -    /// Blocks allocated and owned by the VPlan. They will be deleted once the    /// VPlan is destroyed.    SmallVector<VPBlockBase *> CreatedBlocks; @@ -4424,15 +4419,6 @@ public:    LLVM_DUMP_METHOD void dump() const;  #endif -  VPValue *getSCEVExpansion(const SCEV *S) const { -    return SCEVToExpansion.lookup(S); -  } - -  void addSCEVExpansion(const SCEV *S, VPValue *V) { -    assert(!SCEVToExpansion.contains(S) && "SCEV already expanded"); -    SCEVToExpansion[S] = V; -  } -    /// Clone the current VPlan, update all VPValues of the new VPlan and cloned    /// recipes to refer to the clones, and return it.    VPlan *duplicate(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c385c36..84817d7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -943,12 +943,40 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {    }  } +/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. +/// Returns an optional pair, where the first element indicates whether it is +/// an intrinsic ID. 
+static std::optional<std::pair<bool, unsigned>> +getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { +  return TypeSwitch<const VPSingleDefRecipe *, +                    std::optional<std::pair<bool, unsigned>>>(R) +      .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, +            VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( +          [](auto *I) { return std::make_pair(false, I->getOpcode()); }) +      .Case<VPWidenIntrinsicRecipe>([](auto *I) { +        return std::make_pair(true, I->getVectorIntrinsicID()); +      }) +      .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { +        // For recipes that do not directly map to LLVM IR instructions, +        // assign opcodes after the last VPInstruction opcode (which is also +        // after the last IR Instruction opcode), based on the VPDefID. +        return std::make_pair(false, +                              VPInstruction::OpsEnd + 1 + I->getVPDefID()); +      }) +      .Default([](auto *) { return std::nullopt; }); +} +  /// Try to fold \p R using InstSimplifyFolder. Will succeed and return a -/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are -/// foldable live-ins. -static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode, -                               ArrayRef<VPValue *> Operands, -                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) { +/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p +/// Operands are foldable live-ins. 
+static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R, +                                 ArrayRef<VPValue *> Operands, +                                 const DataLayout &DL, +                                 VPTypeAnalysis &TypeInfo) { +  auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R); +  if (!OpcodeOrIID) +    return nullptr; +    SmallVector<Value *, 4> Ops;    for (VPValue *Op : Operands) {      if (!Op->isLiveIn() || !Op->getLiveInIRValue()) @@ -956,43 +984,57 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,      Ops.push_back(Op->getLiveInIRValue());    } -  InstSimplifyFolder Folder(DL); -  if (Instruction::isBinaryOp(Opcode)) -    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0], +  auto FoldToIRValue = [&]() -> Value * { +    InstSimplifyFolder Folder(DL); +    if (OpcodeOrIID->first) { +      if (R.getNumOperands() != 2) +        return nullptr; +      unsigned ID = OpcodeOrIID->second; +      return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1], +                                        TypeInfo.inferScalarType(&R)); +    } +    unsigned Opcode = OpcodeOrIID->second; +    if (Instruction::isBinaryOp(Opcode)) +      return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), +                              Ops[0], Ops[1]); +    if (Instruction::isCast(Opcode)) +      return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], +                             TypeInfo.inferScalarType(R.getVPSingleValue())); +    switch (Opcode) { +    case VPInstruction::LogicalAnd: +      return Folder.FoldSelect(Ops[0], Ops[1], +                               ConstantInt::getNullValue(Ops[1]->getType())); +    case VPInstruction::Not: +      return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], +                              Constant::getAllOnesValue(Ops[0]->getType())); +    case Instruction::Select: +      return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); +    case Instruction::ICmp: +    case 
Instruction::FCmp: +      return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],                              Ops[1]); -  if (Instruction::isCast(Opcode)) -    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0], -                           TypeInfo.inferScalarType(R.getVPSingleValue())); -  switch (Opcode) { -  case VPInstruction::LogicalAnd: -    return Folder.FoldSelect(Ops[0], Ops[1], -                             ConstantInt::getNullValue(Ops[1]->getType())); -  case VPInstruction::Not: -    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0], -                            Constant::getAllOnesValue(Ops[0]->getType())); -  case Instruction::Select: -    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]); -  case Instruction::ICmp: -  case Instruction::FCmp: -    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0], -                          Ops[1]); -  case Instruction::GetElementPtr: { -    auto &RFlags = cast<VPRecipeWithIRFlags>(R); -    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); -    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops), -                          RFlags.getGEPNoWrapFlags()); -  } -  case VPInstruction::PtrAdd: -  case VPInstruction::WidePtrAdd: -    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0], -                          Ops[1], -                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); -  // An extract of a live-in is an extract of a broadcast, so return the -  // broadcasted element. 
-  case Instruction::ExtractElement: -    assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); -    return Ops[0]; -  } +    case Instruction::GetElementPtr: { +      auto &RFlags = cast<VPRecipeWithIRFlags>(R); +      auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr()); +      return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], +                            drop_begin(Ops), RFlags.getGEPNoWrapFlags()); +    } +    case VPInstruction::PtrAdd: +    case VPInstruction::WidePtrAdd: +      return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), +                            Ops[0], Ops[1], +                            cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags()); +    // An extract of a live-in is an extract of a broadcast, so return the +    // broadcasted element. +    case Instruction::ExtractElement: +      assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar"); +      return Ops[0]; +    } +    return nullptr; +  }; + +  if (Value *V = FoldToIRValue()) +    return R.getParent()->getPlan()->getOrAddLiveIn(V);    return nullptr;  } @@ -1006,19 +1048,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {    // Simplification of live-in IR values for SingleDef recipes using    // InstSimplifyFolder. 
-  if (TypeSwitch<VPRecipeBase *, bool>(&R) -          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, -                VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) { -            const DataLayout &DL = -                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); -            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL, -                                        TypeInfo); -            if (V) -              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V)); -            return V; -          }) -          .Default([](auto *) { return false; })) -    return; +  const DataLayout &DL = +      Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout(); +  if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo)) +    return Def->replaceAllUsesWith(V);    // Fold PredPHI LiveIn -> LiveIn.    if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) { @@ -1996,29 +2029,6 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {      return Def == getEmptyKey() || Def == getTombstoneKey();    } -  /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R. -  /// Returns an optional pair, where the first element indicates whether it is -  /// an intrinsic ID. 
-  static std::optional<std::pair<bool, unsigned>> -  getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) { -    return TypeSwitch<const VPSingleDefRecipe *, -                      std::optional<std::pair<bool, unsigned>>>(R) -        .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe, -              VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>( -            [](auto *I) { return std::make_pair(false, I->getOpcode()); }) -        .Case<VPWidenIntrinsicRecipe>([](auto *I) { -          return std::make_pair(true, I->getVectorIntrinsicID()); -        }) -        .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) { -          // For recipes that do not directly map to LLVM IR instructions, -          // assign opcodes after the last VPInstruction opcode (which is also -          // after the last IR Instruction opcode), based on the VPDefID. -          return std::make_pair(false, -                                VPInstruction::OpsEnd + 1 + I->getVPDefID()); -        }) -        .Default([](auto *) { return std::nullopt; }); -  } -    /// If recipe \p R will lower to a GEP with a non-i8 source element type,    /// return that source element type.    
static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 06c3d75..fe66f13 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -32,8 +32,6 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {  }  VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { -  if (auto *Expanded = Plan.getSCEVExpansion(Expr)) -    return Expanded;    VPValue *Expanded = nullptr;    if (auto *E = dyn_cast<SCEVConstant>(Expr))      Expanded = Plan.getOrAddLiveIn(E->getValue()); @@ -50,7 +48,6 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {        Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());      }    } -  Plan.addSCEVExpansion(Expr, Expanded);    return Expanded;  } | 
