Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/DependenceAnalysis.cpp | 51
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 58
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 48
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 100
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIParser.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/MachineVerifier.cpp | 16
-rw-r--r--  llvm/lib/CodeGen/RegAllocFast.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 5
-rw-r--r--  llvm/lib/IR/AsmWriter.cpp | 4
-rw-r--r--  llvm/lib/MC/CMakeLists.txt | 3
-rw-r--r--  llvm/lib/MC/MCSFrame.cpp | 155
-rw-r--r--  llvm/lib/Support/Timer.cpp | 20
-rw-r--r--  llvm/lib/Target/AArch64/AArch64Combine.td | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 111
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 379
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 64
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | 10
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 16
-rw-r--r--  llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp | 21
-rw-r--r--  llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 53
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 9
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrInfo.td | 12
-rw-r--r--  llvm/lib/Target/DirectX/DXILPrepare.cpp | 89
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp | 130
-rw-r--r--  llvm/lib/Target/DirectX/DXILTranslateMetadata.h | 3
-rw-r--r--  llvm/lib/Target/Hexagon/Hexagon.td | 13
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepArch.h | 4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepArch.td | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepIICHVX.td | 592
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepIICScalar.td | 888
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td | 37
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td | 11
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 162
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonPatternsV65.td | 13
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonSchedule.td | 1
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonScheduleV81.td | 31
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonSubtarget.h | 9
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 59
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h | 6
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 884
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp | 3
-rw-r--r--  llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 45
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrInfo.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFeatures.td | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 35
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 81
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 18
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 4
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSchedule.td | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVScheduleXSf.td | 20
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 4
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 56
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 25
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 9
-rw-r--r--  llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 26
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 43
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 40
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 14
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 164
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3
77 files changed, 4091 insertions(+), 695 deletions(-)
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 853bd66..a572eef 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1582,6 +1582,23 @@ static const SCEV *minusSCEVNoSignedOverflow(const SCEV *A, const SCEV *B,
return nullptr;
}
+/// Returns the absolute value of \p A. In the context of dependence analysis,
+/// we need the absolute value in the mathematical sense. If \p A is the
+/// signed minimum value, its absolute value cannot be represented without
+/// extending the original type. Thus, if we cannot prove that \p A is not the
+/// signed minimum value, return nullptr.
+static const SCEV *absSCEVNoSignedOverflow(const SCEV *A, ScalarEvolution &SE) {
+ IntegerType *Ty = dyn_cast<IntegerType>(A->getType());
+ if (!Ty)
+ return nullptr;
+
+ const SCEV *SMin =
+ SE.getConstant(APInt::getSignedMinValue(Ty->getBitWidth()));
+ if (!SE.isKnownPredicate(CmpInst::ICMP_NE, A, SMin))
+ return nullptr;
+ return SE.getAbsExpr(A, /*IsNSW=*/true);
+}
+
/// Returns true iff \p Test is enabled.
static bool isDependenceTestEnabled(DependenceTestType Test) {
if (EnableDependenceTest == DependenceTestType::All)
@@ -1669,21 +1686,25 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
// check that |Delta| < iteration count
- if (const SCEV *UpperBound =
- collectUpperBound(CurSrcLoop, Delta->getType())) {
+ bool IsDeltaLarge = [&] {
+ const SCEV *UpperBound = collectUpperBound(CurSrcLoop, Delta->getType());
+ if (!UpperBound)
+ return false;
+
LLVM_DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
- const SCEV *AbsDelta =
- SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
- const SCEV *AbsCoeff =
- SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+ const SCEV *AbsDelta = absSCEVNoSignedOverflow(Delta, *SE);
+ const SCEV *AbsCoeff = absSCEVNoSignedOverflow(Coeff, *SE);
+ if (!AbsDelta || !AbsCoeff)
+ return false;
const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
- if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
- // Distance greater than trip count - no dependence
- ++StrongSIVindependence;
- ++StrongSIVsuccesses;
- return true;
- }
+ return isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product);
+ }();
+ if (IsDeltaLarge) {
+ // Distance greater than trip count - no dependence
+ ++StrongSIVindependence;
+ ++StrongSIVsuccesses;
+ return true;
}
// Can we compute distance?
@@ -2259,6 +2280,9 @@ bool DependenceInfo::weakZeroSrcSIVtest(
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
if (!ConstCoeff)
return false;
+
+ // Since ConstCoeff is constant, !isKnownNegative means it's non-negative.
+ // TODO: Bail out if it's a signed minimum value.
const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
? SE->getNegativeSCEV(ConstCoeff)
: ConstCoeff;
@@ -2369,6 +2393,9 @@ bool DependenceInfo::weakZeroDstSIVtest(
const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
if (!ConstCoeff)
return false;
+
+ // Since ConstCoeff is constant, !isKnownNegative means it's non-negative.
+ // TODO: Bail out if it's a signed minimum value.
const SCEV *AbsCoeff = SE->isKnownNegative(ConstCoeff)
? SE->getNegativeSCEV(ConstCoeff)
: ConstCoeff;
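The new absSCEVNoSignedOverflow helper above exists because the signed minimum has no in-type absolute value. A minimal standalone illustration with plain APInt (this snippet is an illustration of the note, not code from the patch):

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    llvm::APInt SMin = llvm::APInt::getSignedMinValue(8); // -128 as an i8
    assert(-SMin == SMin);      // negation wraps: |-128| is -128 again
    assert(SMin.abs() == SMin); // abs() cannot widen the type to fix this
  }

This is why the helper first proves A != SMin before calling SE.getAbsExpr with IsNSW set.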
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b425b95..1f10478 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -391,19 +391,6 @@ void CombinerHelper::applyCombineConcatVectors(
MI.eraseFromParent();
}
-bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
- "Invalid instruction");
- auto &Shuffle = cast<GShuffleVector>(MI);
-
- Register SrcVec1 = Shuffle.getSrc1Reg();
- Register SrcVec2 = Shuffle.getSrc2Reg();
-
- LLT SrcVec1Type = MRI.getType(SrcVec1);
- LLT SrcVec2Type = MRI.getType(SrcVec2);
- return SrcVec1Type.isVector() && SrcVec2Type.isVector();
-}
-
void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const {
auto &Shuffle = cast<GShuffleVector>(MI);
@@ -535,11 +522,9 @@ bool CombinerHelper::matchCombineShuffleVector(
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
Register Src1 = MI.getOperand(1).getReg();
LLT SrcType = MRI.getType(Src1);
- // As bizarre as it may look, shuffle vector can actually produce
- // scalar! This is because at the IR level a <1 x ty> shuffle
- // vector is perfectly valid.
- unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
- unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
+
+ unsigned DstNumElts = DstType.getNumElements();
+ unsigned SrcNumElts = SrcType.getNumElements();
// If the resulting vector is smaller than the size of the source
// vectors being concatenated, we won't be able to replace the
@@ -556,7 +541,7 @@ bool CombinerHelper::matchCombineShuffleVector(
//
// TODO: If the size between the source and destination don't match
// we could still emit an extract vector element in that case.
- if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
+ if (DstNumElts < 2 * SrcNumElts)
return false;
// Check that the shuffle mask can be broken evenly between the
@@ -619,39 +604,6 @@ void CombinerHelper::applyCombineShuffleVector(
MI.eraseFromParent();
}
-bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
- "Invalid instruction kind");
-
- ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
- return Mask.size() == 1;
-}
-
-void CombinerHelper::applyShuffleToExtract(MachineInstr &MI) const {
- Register DstReg = MI.getOperand(0).getReg();
- Builder.setInsertPt(*MI.getParent(), MI);
-
- int I = MI.getOperand(3).getShuffleMask()[0];
- Register Src1 = MI.getOperand(1).getReg();
- LLT Src1Ty = MRI.getType(Src1);
- int Src1NumElts = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1;
- Register SrcReg;
- if (I >= Src1NumElts) {
- SrcReg = MI.getOperand(2).getReg();
- I -= Src1NumElts;
- } else if (I >= 0)
- SrcReg = Src1;
-
- if (I < 0)
- Builder.buildUndef(DstReg);
- else if (!MRI.getType(SrcReg).isVector())
- Builder.buildCopy(DstReg, SrcReg);
- else
- Builder.buildExtractVectorElementConstant(DstReg, SrcReg, I);
-
- MI.eraseFromParent();
-}
-
namespace {
/// Select a preference between two uses. CurrentUse is the current preference
@@ -8369,7 +8321,7 @@ bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,
return false;
ArrayRef<int> Mask = Shuffle.getMask();
- const unsigned NumSrcElems = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1;
+ const unsigned NumSrcElems = Src1Ty.getNumElements();
bool TouchesSrc1 = false;
bool TouchesSrc2 = false;
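For context on the size check that the removed "DstNumElts != 1" special case used to relax, here is a worked instance (a standalone sketch, not from the patch) of why a result smaller than the concatenation cannot be rewritten:

  #include <cassert>

  int main() {
    // Two <4 x s32> sources concatenated always yield 8 elements, so a
    // 4-element destination can never be expressed as G_CONCAT_VECTORS.
    unsigned SrcNumElts = 4, DstNumElts = 4;
    assert(DstNumElts < 2 * SrcNumElts); // the combine bails out here
  }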
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 04d9309..d6f23b6 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -602,6 +602,8 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
Depth + 1);
computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts,
Depth + 1);
+ OffsetKnown = OffsetKnown.sext(BitWidth);
+ WidthKnown = WidthKnown.sext(BitWidth);
Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown);
// Sign extend the extracted value using shift left and arithmetic shift
// right.
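The two added sext calls align operand widths before the known-bits arithmetic. A minimal sketch using llvm::KnownBits directly (standalone; the 8-bit and 32-bit widths are illustrative assumptions):

  #include "llvm/Support/KnownBits.h"
  #include <cassert>

  int main() {
    llvm::KnownBits Offset(8); // e.g. a narrow G_SBFX offset operand
    Offset = Offset.sext(32);  // widen before mixing with 32-bit known bits
    assert(Offset.getBitWidth() == 32);
  }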
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index b49040b..1fc90d0 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3359,6 +3359,54 @@ bool IRTranslator::translateShuffleVector(const User &U,
Mask = SVI->getShuffleMask();
else
Mask = cast<ConstantExpr>(U).getShuffleMask();
+
+ // As GISel does not represent <1 x ...> vectors as a separate type from
+ // scalars, we transform a shuffle_vector with a scalar output into an
+ // ExtractVectorElement. If the input type is also scalar, it becomes a Copy.
+ unsigned DstElts = cast<FixedVectorType>(U.getType())->getNumElements();
+ unsigned SrcElts =
+ cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements();
+ if (DstElts == 1) {
+ unsigned M = Mask[0];
+ if (SrcElts == 1) {
+ if (M == 0 || M == 1)
+ return translateCopy(U, *U.getOperand(M), MIRBuilder);
+ MIRBuilder.buildUndef(getOrCreateVReg(U));
+ } else {
+ Register Dst = getOrCreateVReg(U);
+ if (M < SrcElts) {
+ MIRBuilder.buildExtractVectorElementConstant(
+ Dst, getOrCreateVReg(*U.getOperand(0)), M);
+ } else if (M < SrcElts * 2) {
+ MIRBuilder.buildExtractVectorElementConstant(
+ Dst, getOrCreateVReg(*U.getOperand(1)), M - SrcElts);
+ } else {
+ MIRBuilder.buildUndef(Dst);
+ }
+ }
+ return true;
+ }
+
+ // A single-element source is transformed into a build_vector.
+ if (SrcElts == 1) {
+ SmallVector<Register> Ops;
+ Register Undef;
+ for (int M : Mask) {
+ LLT SrcTy = getLLTForType(*U.getOperand(0)->getType(), *DL);
+ if (M == 0 || M == 1) {
+ Ops.push_back(getOrCreateVReg(*U.getOperand(M)));
+ } else {
+ if (!Undef.isValid()) {
+ Undef = MRI->createGenericVirtualRegister(SrcTy);
+ MIRBuilder.buildUndef(Undef);
+ }
+ Ops.push_back(Undef);
+ }
+ }
+ MIRBuilder.buildBuildVector(getOrCreateVReg(U), Ops);
+ return true;
+ }
+
ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask);
MIRBuilder
.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)},
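The dispatch above follows the usual shuffle-mask convention. A standalone restatement of that bookkeeping (a hypothetical helper, not part of the patch): index M selects src0[M] when M < SrcElts, src1[M - SrcElts] when M < 2 * SrcElts, and an undef lane otherwise.

  #include <cassert>

  struct ShuffleSrc {
    int Operand; // 0 or 1; -1 for an undef lane
    int Index;   // lane within the selected operand
  };

  static ShuffleSrc classifyMaskElt(int M, int SrcElts) {
    if (M < 0 || M >= 2 * SrcElts)
      return {-1, 0};
    return M < SrcElts ? ShuffleSrc{0, M} : ShuffleSrc{1, M - SrcElts};
  }

  int main() {
    assert(classifyMaskElt(0, 1).Operand == 0);   // <1 x ty>: copy of src0
    assert(classifyMaskElt(1, 1).Operand == 1);   // <1 x ty>: copy of src1
    assert(classifyMaskElt(-1, 4).Operand == -1); // undef lane
  }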
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 38ec83f..178529f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4748,6 +4748,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_FMINIMUMNUM:
case G_FMAXIMUMNUM:
return lowerFMinNumMaxNum(MI);
+ case G_FMINIMUM:
+ case G_FMAXIMUM:
+ return lowerFMinimumMaximum(MI);
case G_MERGE_VALUES:
return lowerMergeValues(MI);
case G_UNMERGE_VALUES:
@@ -5819,6 +5822,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
+ } else if (NewElts == 1) {
+ Output = MIRBuilder.buildCopy(NarrowTy, Inputs[InputUsed[0]]).getReg(0);
} else {
Register Op0 = Inputs[InputUsed[0]];
// If only one input was used, use an undefined vector for the other.
@@ -8775,6 +8780,77 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinimumMaximum(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+ LLT CmpTy = Ty.changeElementSize(1);
+
+ bool IsMax = (Opc == TargetOpcode::G_FMAXIMUM);
+ unsigned OpcIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
+ unsigned OpcNonIeee =
+ IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
+ bool MinMaxMustRespectOrderedZero = false;
+ Register Res;
+
+ // IEEE variants don't need canonicalization
+ if (LI.isLegalOrCustom({OpcIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcIeee, {Ty}, {Src0, Src1}).getReg(0);
+ MinMaxMustRespectOrderedZero = true;
+ } else if (LI.isLegalOrCustom({OpcNonIeee, Ty})) {
+ Res = MIRBuilder.buildInstr(OpcNonIeee, {Ty}, {Src0, Src1}).getReg(0);
+ } else {
+ auto Compare = MIRBuilder.buildFCmp(
+ IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT, CmpTy, Src0, Src1);
+ Res = MIRBuilder.buildSelect(Ty, Compare, Src0, Src1).getReg(0);
+ }
+
+ // Propagate a NaN from either operand.
+ if (!MI.getFlag(MachineInstr::FmNoNans) &&
+ (!isKnownNeverNaN(Src0, MRI) || !isKnownNeverNaN(Src1, MRI))) {
+ auto IsOrdered = MIRBuilder.buildFCmp(CmpInst::FCMP_ORD, CmpTy, Src0, Src1);
+
+ LLT ElementTy = Ty.isScalar() ? Ty : Ty.getElementType();
+ APFloat NaNValue = APFloat::getNaN(getFltSemanticForLLT(ElementTy));
+ Register NaN = MIRBuilder.buildFConstant(ElementTy, NaNValue).getReg(0);
+ if (Ty.isVector())
+ NaN = MIRBuilder.buildSplatBuildVector(Ty, NaN).getReg(0);
+
+ Res = MIRBuilder.buildSelect(Ty, IsOrdered, Res, NaN).getReg(0);
+ }
+
+ // fminimum/fmaximum require -0.0 to compare less than +0.0.
+ if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz)) {
+ GISelValueTracking VT(MIRBuilder.getMF());
+ KnownFPClass Src0Info = VT.computeKnownFPClass(Src0, fcZero);
+ KnownFPClass Src1Info = VT.computeKnownFPClass(Src1, fcZero);
+
+ if (!Src0Info.isKnownNeverZero() && !Src1Info.isKnownNeverZero()) {
+ const unsigned Flags = MI.getFlags();
+ Register Zero = MIRBuilder.buildFConstant(Ty, 0.0).getReg(0);
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_OEQ, CmpTy, Res, Zero);
+
+ unsigned TestClass = IsMax ? fcPosZero : fcNegZero;
+
+ auto LHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src0, TestClass);
+ auto LHSSelect =
+ MIRBuilder.buildSelect(Ty, LHSTestZero, Src0, Res, Flags);
+
+ auto RHSTestZero = MIRBuilder.buildIsFPClass(CmpTy, Src1, TestClass);
+ auto RHSSelect =
+ MIRBuilder.buildSelect(Ty, RHSTestZero, Src1, LHSSelect, Flags);
+
+ Res = MIRBuilder.buildSelect(Ty, IsZero, RHSSelect, Res, Flags).getReg(0);
+ }
+ }
+
+ MIRBuilder.buildCopy(Dst, Res);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
// Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
Register DstReg = MI.getOperand(0).getReg();
@@ -9016,22 +9092,18 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
continue;
}
- if (Src0Ty.isScalar()) {
- BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
- } else {
- int NumElts = Src0Ty.getNumElements();
- Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
- int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
- auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
- auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
- BuildVec.push_back(Extract.getReg(0));
- }
+ assert(!Src0Ty.isScalar() && "Unexpected scalar G_SHUFFLE_VECTOR");
+
+ int NumElts = Src0Ty.getNumElements();
+ Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
+ int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
+ auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
+ auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
+ BuildVec.push_back(Extract.getReg(0));
}
- if (DstTy.isVector())
- MIRBuilder.buildBuildVector(DstReg, BuildVec);
- else
- MIRBuilder.buildCopy(DstReg, BuildVec[0]);
+ assert(DstTy.isVector() && "Unexpected scalar G_SHUFFLE_VECTOR");
+ MIRBuilder.buildBuildVector(DstReg, BuildVec);
MI.eraseFromParent();
return Legalized;
}
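For reference, the semantics lowerFMinimumMaximum implements are those of IEEE 754-2019 minimum/maximum: NaNs propagate, and -0.0 orders below +0.0. A scalar double sketch of the minimum case (standalone, not part of the patch):

  #include <cmath>
  #include <limits>

  static double fminimumRef(double A, double B) {
    if (std::isnan(A) || std::isnan(B))                 // propagate any NaN
      return std::numeric_limits<double>::quiet_NaN();
    if (A == 0.0 && B == 0.0)                           // +0.0 == -0.0 here,
      return std::signbit(A) ? A : B;                   // so pick -0.0
    return A < B ? A : B;
  }

The legalized sequence mirrors this: the FCMP_ORD select handles the NaN clause, and the fcPosZero/fcNegZero class tests handle the signed-zero clause when the chosen min/max node does not already order zeros.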
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 27df7e3..4b4df98 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -800,10 +800,11 @@ MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res,
LLT DstTy = Res.getLLTTy(*getMRI());
LLT Src1Ty = Src1.getLLTTy(*getMRI());
LLT Src2Ty = Src2.getLLTTy(*getMRI());
- const LLT DstElemTy = DstTy.isVector() ? DstTy.getElementType() : DstTy;
- const LLT ElemTy1 = Src1Ty.isVector() ? Src1Ty.getElementType() : Src1Ty;
- const LLT ElemTy2 = Src2Ty.isVector() ? Src2Ty.getElementType() : Src2Ty;
+ const LLT DstElemTy = DstTy.getScalarType();
+ const LLT ElemTy1 = Src1Ty.getScalarType();
+ const LLT ElemTy2 = Src2Ty.getScalarType();
assert(DstElemTy == ElemTy1 && DstElemTy == ElemTy2);
+ assert(Mask.size() > 1 && "Scalar G_SHUFFLE_VECTOR is not supported");
(void)DstElemTy;
(void)ElemTy1;
(void)ElemTy2;
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 6a464d9..4795d81 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -2788,6 +2788,9 @@ bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) {
if (expectAndConsume(MIToken::rparen))
return error("shufflemask should be terminated by ')'.");
+ if (ShufMask.size() < 2)
+ return error("shufflemask should have > 1 element");
+
ArrayRef<int> MaskAlloc = MF.allocateShuffleMask(ShufMask);
Dest = MachineOperand::CreateShuffleMask(MaskAlloc);
return false;
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 1154855..c0710c4 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1924,13 +1924,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (Src0Ty != Src1Ty)
report("Source operands must be the same type", MI);
- if (Src0Ty.getScalarType() != DstTy.getScalarType())
+ if (Src0Ty.getScalarType() != DstTy.getScalarType()) {
report("G_SHUFFLE_VECTOR cannot change element type", MI);
+ break;
+ }
+ if (!Src0Ty.isVector()) {
+ report("G_SHUFFLE_VECTOR must have vector src", MI);
+ break;
+ }
+ if (!DstTy.isVector()) {
+ report("G_SHUFFLE_VECTOR must have vector dst", MI);
+ break;
+ }
- // Don't check that all operands are vector because scalars are used in
- // place of 1 element vectors.
- int SrcNumElts = Src0Ty.isVector() ? Src0Ty.getNumElements() : 1;
- int DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
+ int SrcNumElts = Src0Ty.getNumElements();
+ int DstNumElts = DstTy.getNumElements();
ArrayRef<int> MaskIdxes = MaskOp.getShuffleMask();
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 72b364c..697b779 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -211,7 +211,7 @@ private:
unsigned getSparseSetIndex() const { return VirtReg.virtRegIndex(); }
};
- using LiveRegMap = SparseSet<LiveReg, unsigned, identity_cxx20, uint16_t>;
+ using LiveRegMap = SparseSet<LiveReg, unsigned, identity, uint16_t>;
/// This map contains entries for each virtual register that is currently
/// available in a physical register.
LiveRegMap LiveVirtRegs;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d2ea652..8676060 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19993,8 +19993,12 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
- for (SDNode *Op : Ptr->users()) {
+ for (SDUse &U : Ptr->uses()) {
+ if (U.getResNo() != Ptr.getResNo())
+ continue;
+
// Check for #1.
+ SDNode *Op = U.getUser();
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
continue;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 060b1dd..59798b3 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2097,6 +2097,11 @@ Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
}
Function *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ RTLIB::LibcallImpl SecurityCheckCookieLibcall =
+ getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
+ if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
+ return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
return nullptr;
}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 488b078..1096e57 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4082,10 +4082,10 @@ void AssemblyWriter::printTypeIdentities() {
/// printFunction - Print all aspects of a function.
void AssemblyWriter::printFunction(const Function *F) {
- if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out);
-
if (F->isMaterializable())
Out << "; Materializable\n";
+ else if (AnnotationWriter)
+ AnnotationWriter->emitFunctionAnnot(F, Out);
const AttributeList &Attrs = F->getAttributes();
if (Attrs.hasFnAttrs()) {
diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt
index 1e1d0a6..70c4577 100644
--- a/llvm/lib/MC/CMakeLists.txt
+++ b/llvm/lib/MC/CMakeLists.txt
@@ -73,9 +73,10 @@ add_llvm_component_library(LLVMMC
${LLVM_MAIN_INCLUDE_DIR}/llvm/MC
LINK_COMPONENTS
+ BinaryFormat
+ DebugInfoDWARFLowLevel
Support
TargetParser
- BinaryFormat
DEPENDS
intrinsics_gen
diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp
index d6fa54c..e0a90df 100644
--- a/llvm/lib/MC/MCSFrame.cpp
+++ b/llvm/lib/MC/MCSFrame.cpp
@@ -8,6 +8,8 @@
#include "llvm/MC/MCSFrame.h"
#include "llvm/BinaryFormat/SFrame.h"
+#include "llvm/DebugInfo/DWARF/LowLevel/DWARFCFIProgram.h"
+#include "llvm/DebugInfo/DWARF/LowLevel/DWARFDataExtractorSimple.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
@@ -211,8 +213,152 @@ class SFrameEmitterImpl {
return true;
}
+ // Technically, the escape data could be anything, but it is commonly a DWARF
+ // CFI program. Even then, it could contain an arbitrarily complicated DWARF
+ // expression. Following GNU gas, look for certain common sequences that
+ // could invalidate an FDE, emit a warning for them, and don't generate an
+ // FDE in those cases. Allow any sequences that are known safe. More thorough
+ // test cases could likely refine this code, but it handles the most
+ // important cases compatibly with gas.
+ // Returns true if the CFI escape sequence is safe for SFrame.
+ bool isCFIEscapeSafe(SFrameFDE &FDE, const SFrameFRE &FRE,
+ const MCCFIInstruction &CFI) {
+ const MCAsmInfo *AI = Streamer.getContext().getAsmInfo();
+ DWARFDataExtractorSimple Data(CFI.getValues(), AI->isLittleEndian(),
+ AI->getCodePointerSize());
+
+ // Normally, both alignment factors are extracted from the enclosing DWARF
+ // FDE or CIE, but we don't have one here. The alignments are used as
+ // scaling factors for ops like DW_CFA_def_cfa_offset_sf; this particular
+ // function is only interested in registers.
+ dwarf::CFIProgram P(/*CodeAlignmentFactor=*/1,
+ /*DataAlignmentFactor=*/1,
+ Streamer.getContext().getTargetTriple().getArch());
+ uint64_t Offset = 0;
+ if (P.parse(Data, &Offset, CFI.getValues().size())) {
+ // Not a parsable dwarf expression. Assume the worst.
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+
+ // This loop deals with dwarf::CFIProgram::Instructions. Everywhere else
+ // this file deals with MCCFIInstructions.
+ for (const dwarf::CFIProgram::Instruction &I : P) {
+ switch (I.Opcode) {
+ case dwarf::DW_CFA_nop:
+ break;
+ case dwarf::DW_CFA_val_offset: {
+ // The first argument is a register. Anything that touches SP, FP, or RA
+ // is a problem, but allow others through. As an even more special case,
+ // allow SP + 0.
+ auto Reg = I.getOperandAsUnsigned(P, 0);
+ // The parser should have failed in this case.
+ assert(Reg && "DW_CFA_val_offset with no register.");
+ bool SPOk = true;
+ if (*Reg == SPReg) {
+ auto Opnd = I.getOperandAsSigned(P, 1);
+ if (!Opnd || *Opnd != 0)
+ SPOk = false;
+ }
+ if (!SPOk || *Reg == RAReg || *Reg == FPReg) {
+ StringRef RN = *Reg == SPReg
+ ? "SP reg "
+ : (*Reg == FPReg ? "FP reg " : "RA reg ");
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine(
+ "skipping SFrame FDE; .cfi_escape DW_CFA_val_offset with ") +
+ RN + Twine(*Reg));
+ return false;
+ }
+ } break;
+ case dwarf::DW_CFA_expression: {
+ // The first argument is a register. Anything that touches SP, FP, or RA
+ // is a problem, but allow others through.
+ auto Reg = I.getOperandAsUnsigned(P, 0);
+ if (!Reg) {
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+ if (*Reg == SPReg || *Reg == RAReg || *Reg == FPReg) {
+ StringRef RN = *Reg == SPReg
+ ? "SP reg "
+ : (*Reg == FPReg ? "FP reg " : "RA reg ");
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine(
+ "skipping SFrame FDE; .cfi_escape DW_CFA_expression with ") +
+ RN + Twine(*Reg));
+ return false;
+ }
+ } break;
+ case dwarf::DW_CFA_GNU_args_size: {
+ auto Size = I.getOperandAsSigned(P, 0);
+ // A zero size doesn't affect the CFA.
+ if (Size && *Size == 0)
+ break;
+ if (FRE.Info.getBaseRegister() != BaseReg::FP) {
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ Twine("skipping SFrame FDE; .cfi_escape DW_CFA_GNU_args_size "
+ "with non frame-pointer CFA"));
+ return false;
+ }
+ } break;
+ // Cases that gas doesn't specially handle. TODO: Some of these could be
+ // analyzed and handled instead of just punting. But these are uncommon,
+ // or should be written as normal cfi directives. Some will need fixes to
+ // the scaling factor.
+ case dwarf::DW_CFA_advance_loc:
+ case dwarf::DW_CFA_offset:
+ case dwarf::DW_CFA_restore:
+ case dwarf::DW_CFA_set_loc:
+ case dwarf::DW_CFA_advance_loc1:
+ case dwarf::DW_CFA_advance_loc2:
+ case dwarf::DW_CFA_advance_loc4:
+ case dwarf::DW_CFA_offset_extended:
+ case dwarf::DW_CFA_restore_extended:
+ case dwarf::DW_CFA_undefined:
+ case dwarf::DW_CFA_same_value:
+ case dwarf::DW_CFA_register:
+ case dwarf::DW_CFA_remember_state:
+ case dwarf::DW_CFA_restore_state:
+ case dwarf::DW_CFA_def_cfa:
+ case dwarf::DW_CFA_def_cfa_register:
+ case dwarf::DW_CFA_def_cfa_offset:
+ case dwarf::DW_CFA_def_cfa_expression:
+ case dwarf::DW_CFA_offset_extended_sf:
+ case dwarf::DW_CFA_def_cfa_sf:
+ case dwarf::DW_CFA_def_cfa_offset_sf:
+ case dwarf::DW_CFA_val_offset_sf:
+ case dwarf::DW_CFA_val_expression:
+ case dwarf::DW_CFA_MIPS_advance_loc8:
+ case dwarf::DW_CFA_AARCH64_negate_ra_state_with_pc:
+ case dwarf::DW_CFA_AARCH64_negate_ra_state:
+ case dwarf::DW_CFA_LLVM_def_aspace_cfa:
+ case dwarf::DW_CFA_LLVM_def_aspace_cfa_sf:
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(), "skipping SFrame FDE; .cfi_escape "
+ "CFA expression with unknown side effects");
+ return false;
+ default:
+ // The DWARF expression was only partially valid, and the user could
+ // have written anything.
+ Streamer.getContext().reportWarning(
+ CFI.getLoc(),
+ "skipping SFrame FDE; .cfi_escape with unknown effects");
+ return false;
+ }
+ }
+ return true;
+ }
+
// Add the effects of CFI to the current FDE, creating a new FRE when
- // necessary.
+ // necessary. Return true if the CFI is representable in the SFrame format.
bool handleCFI(SFrameFDE &FDE, SFrameFRE &FRE, const MCCFIInstruction &CFI) {
switch (CFI.getOperation()) {
case MCCFIInstruction::OpDefCfaRegister:
@@ -265,10 +411,11 @@ class SFrameEmitterImpl {
FRE = FDE.SaveState.pop_back_val();
return true;
case MCCFIInstruction::OpEscape:
- // TODO: Implement. Will use FDE.
- return true;
+ // This is a string of bytes encoding an arbitrary DWARF CFI program that
+ // may or may not affect the unwind info.
+ return isCFIEscapeSafe(FDE, FRE, CFI);
default:
- // Instructions that don't affect the CFA, RA, and SP can be safely
+ // Instructions that don't affect the CFA, RA, or FP can be safely
// ignored.
return true;
}
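Two concrete escape payloads and how the filter classifies them (the opcode values 0x2e = DW_CFA_GNU_args_size and 0x16 = DW_CFA_val_expression are from the DWARF spec; the classification mirrors the switch above):

  #include <cstdint>

  // .cfi_escape 0x2e, 0x00 -- DW_CFA_GNU_args_size 0: safe, the size is zero.
  static const uint8_t SafeEscape[] = {0x2e, 0x00};
  // .cfi_escape 0x2e, 0x10 -- DW_CFA_GNU_args_size 16: accepted only when the
  // current FRE's CFA base is the frame pointer.
  static const uint8_t CfaDependent[] = {0x2e, 0x10};
  // DW_CFA_val_expression (0x16) is on the always-reject list.
  static const uint8_t Rejected[] = {0x16 /* register and expression follow */};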
diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp
index 67483ba..9d45096 100644
--- a/llvm/lib/Support/Timer.cpp
+++ b/llvm/lib/Support/Timer.cpp
@@ -240,7 +240,8 @@ private:
getGroupEntry(StringRef GroupName, StringRef GroupDescription) {
std::pair<TimerGroup *, Name2TimerMap> &GroupEntry = Map[GroupName];
if (!GroupEntry.first)
- GroupEntry.first = new TimerGroup(GroupName, GroupDescription);
+ GroupEntry.first =
+ new TimerGroup(GroupName, GroupDescription, /*PrintOnExit=*/true);
return GroupEntry;
}
@@ -270,9 +271,10 @@ TimerGroup &NamedRegionTimer::getNamedTimerGroup(StringRef GroupName,
static TimerGroup *TimerGroupList = nullptr;
TimerGroup::TimerGroup(StringRef Name, StringRef Description,
- sys::SmartMutex<true> &lock)
+ sys::SmartMutex<true> &lock, bool PrintOnExit)
: Name(Name.begin(), Name.end()),
- Description(Description.begin(), Description.end()) {
+ Description(Description.begin(), Description.end()),
+ PrintOnExit(PrintOnExit) {
// Add the group to TimerGroupList.
sys::SmartScopedLock<true> L(lock);
if (TimerGroupList)
@@ -282,12 +284,12 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description,
TimerGroupList = this;
}
-TimerGroup::TimerGroup(StringRef Name, StringRef Description)
- : TimerGroup(Name, Description, timerLock()) {}
+TimerGroup::TimerGroup(StringRef Name, StringRef Description, bool PrintOnExit)
+ : TimerGroup(Name, Description, timerLock(), PrintOnExit) {}
TimerGroup::TimerGroup(StringRef Name, StringRef Description,
- const StringMap<TimeRecord> &Records)
- : TimerGroup(Name, Description) {
+ const StringMap<TimeRecord> &Records, bool PrintOnExit)
+ : TimerGroup(Name, Description, PrintOnExit) {
TimersToPrint.reserve(Records.size());
for (const auto &P : Records)
TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()),
@@ -301,7 +303,7 @@ TimerGroup::~TimerGroup() {
while (FirstTimer)
removeTimer(*FirstTimer);
- if (!TimersToPrint.empty()) {
+ if (!TimersToPrint.empty() && PrintOnExit) {
std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();
PrintQueuedTimers(*OutStream);
}
@@ -530,7 +532,7 @@ public:
sys::SmartMutex<true> TimerLock;
TimerGroup DefaultTimerGroup{"misc", "Miscellaneous Ungrouped Timers",
- TimerLock};
+ TimerLock, /*PrintOnExit=*/true};
SignpostEmitter Signposts;
// Order of these members and initialization below is important. For example
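A minimal usage sketch of the new flag (assuming the public constructor exposes the same defaulted parameter; the group name and description here are illustrative): a group that collects timing data but stays silent in its destructor, e.g. when results are reported through another channel.

  #include "llvm/Support/Timer.h"

  static llvm::TimerGroup SilentGroup("quiet", "Timers reported elsewhere",
                                      /*PrintOnExit=*/false);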
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ecaeff7..b3ec65c 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -71,7 +71,6 @@ def AArch64PreLegalizerCombiner: GICombiner<
"AArch64PreLegalizerCombinerImpl", [all_combines,
icmp_redundant_trunc,
fold_global_offset,
- shuffle_to_extract,
ext_addv_to_udot_addv,
ext_uaddv_to_uaddlv,
push_sub_through_zext,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a81de5c..d16b116 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9002,12 +9002,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
}
static SMECallAttrs
-getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI,
+getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
const TargetLowering::CallLoweringInfo &CLI) {
if (CLI.CB)
- return SMECallAttrs(*CLI.CB, &TLI);
+ return SMECallAttrs(*CLI.CB, &RTLCI);
if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
- return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI));
+ return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
}
@@ -9029,7 +9029,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// SME Streaming functions are not eligible for TCO as they may require
// the streaming mode or ZA to be restored after returning from the call.
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI);
+ SMECallAttrs CallAttrs =
+ getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingAllZAState() ||
CallAttrs.caller().hasStreamingBody())
@@ -9454,7 +9455,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Determine whether we need any streaming mode changes.
- SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+ SMECallAttrs CallAttrs =
+ getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
std::optional<unsigned> ZAMarkerNode;
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
@@ -19476,6 +19478,61 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
Op1 ? Op1 : Mul->getOperand(1));
}
+// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
+// folding a power-of-two factor of the constant into the RDSVL immediate and
+// compensating with an extra shift.
+//
+// We rewrite:
+// (mul (srl (rdsvl 1), w), x)
+// to one of:
+// (shl (rdsvl y), z) if z > 0
+// (srl (rdsvl y), abs(z)) if z < 0
+// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
+static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
+ SDLoc DL(Mul);
+ EVT VT = Mul->getValueType(0);
+ SDValue MulOp0 = Mul->getOperand(0);
+ int ConstMultiplier =
+ cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
+ if ((MulOp0->getOpcode() != ISD::SRL) ||
+ (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
+ return SDValue();
+
+ unsigned AbsConstValue = abs(ConstMultiplier);
+ unsigned OperandShift =
+ cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
+
+ // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
+ // integral)
+ int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
+
+ // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
+ // 2^(w + z) ≥ ceil(|x| / B) ⇒ z ≥ ceil_log2(ceil(|x| / B)) - w (LowerBound).
+ unsigned B = ConstMultiplier < 0 ? 32 : 31;
+ unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
+ int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
+
+ // No valid solution found.
+ if (LowerBound > UpperBound)
+ return SDValue();
+
+ // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
+ // shift if possible.
+ int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
+
+ // y = x / 2^(w + z)
+ int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
+ (ConstMultiplier < 0 ? -1 : 1);
+ auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
+
+ if (Shift == 0)
+ return Rdsvl;
+ return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
+ DAG.getConstant(abs(Shift), DL, MVT::i32),
+ SDNodeFlags::Exact);
+}
+
// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
// Same for other types with equivalent constants.
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
@@ -19604,6 +19661,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(N1))
return SDValue();
+ if (SDValue Ext = performMulRdsvlCombine(N, DAG))
+ return Ext;
+
ConstantSDNode *C = cast<ConstantSDNode>(N1);
const APInt &ConstValue = C->getAPIntValue();
@@ -26665,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N,
}
if (N->getOpcode() == AArch64ISD::DUP) {
+ SDValue Op = N->getOperand(0);
+
+ // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
+ // For example:
+ // v4i32 = DUP (i32 (zextloadi8 addr))
+ // =>
+ // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+ // v4i32 = DUPLANE32 (v4i32), 0
+ if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ EVT MemVT = LD->getMemoryVT();
+ EVT ElemVT = VT.getVectorElementType();
+ if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+ (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
+ ElemVT != MemVT && LD->hasOneUse()) {
+ EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+ 128 / ElemVT.getSizeInBits());
+ SDValue ScalarToVec =
+ DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+ return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+ DCI.DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
// If the instruction is known to produce a scalar in SIMD registers, we can
// duplicate it across the vector lanes using DUPLANE instead of moving it
// to a GPR first. For example, this allows us to handle:
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
- SDValue Op = N->getOperand(0);
// FIXME: Ideally, we should be able to handle all instructions that
// produce a scalar value in FPRs.
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
@@ -29430,15 +29513,6 @@ void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
TargetLowering::insertSSPDeclarations(M);
}
-Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
- // MSVC CRT has a function to validate security cookie.
- RTLIB::LibcallImpl SecurityCheckCookieLibcall =
- getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
- if (SecurityCheckCookieLibcall != RTLIB::Unsupported)
- return M.getFunction(getLibcallImplName(SecurityCheckCookieLibcall));
- return TargetLowering::getSSPStackGuardCheck(M);
-}
-
Value *
AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
@@ -29447,11 +29521,6 @@ AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
if (Subtarget->isTargetAndroid())
return UseTlsOffset(IRB, 0x48);
- // Fuchsia is similar.
- // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
- if (Subtarget->isTargetFuchsia())
- return UseTlsOffset(IRB, -0x8);
-
return TargetLowering::getSafeStackPointerLocation(IRB);
}
@@ -29769,7 +29838,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
// Checks to allow the use of SME instructions
if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallAttrs = SMECallAttrs(*Base, this);
+ auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());
if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
CallAttrs.requiresPreservingZT0() ||
CallAttrs.requiresPreservingAllZAState())
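The bounds in performMulRdsvlCombine can be checked in isolation. A standalone re-derivation of the same arithmetic (no LLVM dependencies; assumes C++20 <bit>): for x = 6 and w = 1 it yields y = 3, z = 0, i.e. (mul (srl (rdsvl 1), 1), 6) becomes (rdsvl 3), since 6 * (SVL >> 1) == 3 * SVL.

  #include <algorithm>
  #include <bit>
  #include <cassert>
  #include <cstdlib>

  static bool solve(int X, int W, int &Y, int &Z) {
    unsigned AbsX = std::abs(X);
    int Upper = std::countr_zero(AbsX) - W;          // keeps y integral
    unsigned B = X < 0 ? 32 : 31;                    // y must fit in [-32, 31]
    unsigned CeilXB = (AbsX + B - 1) / B;            // ceil(|x| / B)
    int Lower = int(std::bit_width(CeilXB - 1)) - W; // ceil_log2(CeilXB) - w
    if (Lower > Upper)
      return false;
    Z = std::min(std::max(0, Lower), Upper);         // prefer no extra shift
    assert(W + Z >= 0 && "sketch assumes a non-negative total shift");
    Y = int(AbsX >> unsigned(W + Z)) * (X < 0 ? -1 : 1);
    return true;
  }

  int main() {
    int Y, Z;
    assert(solve(6, 1, Y, Z) && Y == 3 && Z == 0);
  }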
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9495c9f..2cb8ed2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -366,7 +366,6 @@ public:
Value *getIRStackGuard(IRBuilderBase &IRB) const override;
void insertSSPDeclarations(Module &M) const override;
- Function *getSSPStackGuardCheck(const Module &M) const override;
/// If the target has a standard location for the unsafe stack pointer,
/// returns the address of that location. Otherwise, returns nullptr.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index eab1627..58a53af 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5298,7 +5298,7 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
}
multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm,
- SDPatternOperator OpN = null_frag> {
+ SDPatternOperator OpN> {
// double-precision to 32-bit SIMD/FPR
def SDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm,
[(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b74ca79..b9e299e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4022,22 +4022,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
-// load zero-extended i32, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-// load zero-extended i16, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-// load zero-extended i8, bitcast to f64
-def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-// load zero-extended i16, bitcast to f32
-def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-// load zero-extended i8, bitcast to f32
-def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-
// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
@@ -4389,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
+// Patterns for bitconvert or scalar_to_vector of load operations.
+// These enable direct SIMD-register loads for small integer types (i8/i16,
+// and i32 in the extended variant) that are naturally zero-extended to
+// i32/i64.
+multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy,
+ SDPatternOperator OuterOp,
+ PatFrags LoadOp8, PatFrags LoadOp16> {
+ // 8-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>;
+
+ // 16-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>;
+}
+
+// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit.
+multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy,
+ SDPatternOperator OuterOp,
+ PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> {
+ defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>;
+
+ // 32-bit loads.
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>;
+ def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>;
+}
+
+// Instantiate bitconvert patterns for floating-point types.
+defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>;
+
+// Instantiate scalar_to_vector patterns for all vector types.
+defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>;
+defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>;
+defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>;
+defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>;
+
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
@@ -5253,113 +5295,10 @@ let Predicates = [HasNEON, HasFPRCVT] in{
defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>;
defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>;
defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>;
- defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">;
- defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">;
-}
-
-
-// AArch64's FCVT instructions saturate when out of range.
-multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
- let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
- (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
- def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
- (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
- }
- def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
- (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
- def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
- (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
- def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
- (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
- def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
- (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
-
- let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (to_int_sat_gi f16:$Rn)),
- (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
- def : Pat<(i64 (to_int_sat_gi f16:$Rn)),
- (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
- }
- def : Pat<(i32 (to_int_sat_gi f32:$Rn)),
- (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
- def : Pat<(i64 (to_int_sat_gi f32:$Rn)),
- (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
- def : Pat<(i32 (to_int_sat_gi f64:$Rn)),
- (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
- def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
- (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
-
- let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)),
- (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)),
- (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
- }
- def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)),
- (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)),
- (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
- def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)),
- (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)),
- (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
-
- let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
- (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
- (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
- }
- def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
- (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
- (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
- def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
- (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
- def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
- (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
-}
-
-defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">;
-defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">;
-
-multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, string INST> {
- def : Pat<(i32 (to_int (round f32:$Rn))),
- (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
- def : Pat<(i64 (to_int (round f32:$Rn))),
- (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
- def : Pat<(i32 (to_int (round f64:$Rn))),
- (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
- def : Pat<(i64 (to_int (round f64:$Rn))),
- (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
-
- // These instructions saturate like fp_to_[su]int_sat.
- let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
- (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
- def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
- (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
- }
- def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
- (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
- def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
- (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
- def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)),
- (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
- def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
- (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+ defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs", any_fp_to_sint>;
+ defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>;
}
-defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil, "FCVTPS">;
-defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fceil, "FCVTPU">;
-defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ffloor, "FCVTMS">;
-defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ffloor, "FCVTMU">;
-defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, ftrunc, "FCVTZS">;
-defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, ftrunc, "FCVTZU">;
-defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fround, "FCVTAS">;
-defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fround, "FCVTAU">;
-
let Predicates = [HasFullFP16] in {
@@ -6567,8 +6506,8 @@ defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtn
defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
+defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
@@ -6588,6 +6527,7 @@ defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
// Floating-point conversion patterns.
multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {
+ let Predicates = [HasFPRCVT] in {
def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))),
(!cast<Instruction>(INST # SDr) FPR64:$Rn)>;
def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))),
@@ -6596,6 +6536,7 @@ multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {
(!cast<Instruction>(INST # DHr) FPR16:$Rn)>;
def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),
(!cast<Instruction>(INST # DSr) FPR32:$Rn)>;
+ }
def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),
(!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;
def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))),
@@ -6610,6 +6551,8 @@ defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">;
defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">;
defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">;
defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">;
+defm: FPToIntegerSIMDScalarPatterns<any_fp_to_sint, "FCVTZS">;
+defm: FPToIntegerSIMDScalarPatterns<any_fp_to_uint, "FCVTZU">;
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
let Predicates = [HasFullFP16] in {
@@ -6666,6 +6609,196 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;
+// AArch64's FCVT instructions saturate when out of range.
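+// For example, an out-of-range (i32 (fp_to_sint_sat f32:$Rn, i32)) must clamp
+// to INT32_MIN/INT32_MAX, which matches the clamping behaviour of FCVTZS, so
+// the saturating nodes can select directly to the plain conversions below.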
+multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat f16:$Rn, i64)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat f32:$Rn, i32)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat f32:$Rn, i64)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat f64:$Rn, i32)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat f64:$Rn, i64)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i32 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f16:$Rn, i32)))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f16:$Rn, i64)))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f32:$Rn, i64)))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f64:$Rn, i32)))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f64:$Rn, i64)))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ }
+ def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
+
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
+}
+
+defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">;
+defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">;
+
+multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode to_int_sat_gi, SDNode round, string INST> {
+ def : Pat<(i32 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i64 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(i32 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(i64 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f64 (bitconvert (i64 (to_int (round f32:$Rn))))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int (round f64:$Rn))))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int (round f32:$Rn))))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int (round f64:$Rn))))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ // These instructions saturate like fp_to_[su]int_sat.
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i32 (to_int_sat_gi (round f16:$Rn))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi (round f16:$Rn))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi (round f32:$Rn))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat_gi (round f64:$Rn))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi (round f32:$Rn))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi (round f64:$Rn))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f16:$Rn), i32)))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f16:$Rn), i64)))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f32:$Rn), i64)))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f64:$Rn), i32)))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f32:$Rn), i32)))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f64:$Rn), i64)))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+}
+
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fceil, "FCVTPS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fceil, "FCVTPU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, ffloor, "FCVTMS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ffloor, "FCVTMU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, ftrunc, "FCVTZS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, ftrunc, "FCVTZU">;
+defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fp_to_sint_sat_gi, fround, "FCVTAS">;
+defm : FPToIntegerPats<fp_to_uint, fp_to_uint_sat, fp_to_uint_sat_gi, fround, "FCVTAU">;
+
// f16 -> s16 conversions
let Predicates = [HasFullFP16] in {
def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index bdde8e3..2387f17 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -2762,11 +2762,11 @@ def : InstRW<[V2Write_11c_18L01_18V01], (instregex "^ST4[BHWD]_IMM$")>;
def : InstRW<[V2Write_11c_18L01_18S_18V01], (instregex "^ST4[BHWD]$")>;
// Non temporal store, scalar + imm
-def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BHWD]_ZRI$")>;
+def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BHWD]_ZRI$")>;
// Non temporal store, scalar + scalar
-def : InstRW<[V2Write_2c_1L01_1S_1V], (instrs STNT1H_ZRR)>;
-def : InstRW<[V2Write_2c_1L01_1V], (instregex "^STNT1[BWD]_ZRR$")>;
+def : InstRW<[V2Write_2c_1L01_1S_1V01], (instrs STNT1H_ZRR)>;
+def : InstRW<[V2Write_2c_1L01_1V01], (instregex "^STNT1[BWD]_ZRR$")>;
// Scatter non temporal store, vector + scalar 32-bit element size
def : InstRW<[V2Write_4c_4L01_4V01], (instregex "^STNT1[BHW]_ZZR_S")>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 2053fc4..fede586 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -224,7 +224,8 @@ static cl::opt<bool> EnableScalableAutovecInStreamingMode(
static bool isSMEABIRoutineCall(const CallInst &CI,
const AArch64TargetLowering &TLI) {
const auto *F = CI.getCalledFunction();
- return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine();
+ return F &&
+ SMEAttrs(F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
}
/// Returns true if the function has explicit operations that can only be
@@ -355,7 +356,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
// change only once and avoid inlining of G into F.
SMEAttrs FAttrs(*F);
- SMECallAttrs CallAttrs(Call, getTLI());
+ SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
if (F == Call.getCaller()) // (1)
@@ -957,23 +958,50 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return TyL.first + ExtraCost;
}
case Intrinsic::get_active_lane_mask: {
- auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
- if (RetTy) {
- EVT RetVT = getTLI()->getValueType(DL, RetTy);
- EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
- if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
- !getTLI()->isTypeLegal(RetVT)) {
- // We don't have enough context at this point to determine if the mask
- // is going to be kept live after the block, which will force the vXi1
- // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
- // For now, we just assume the vectorizer created this intrinsic and
- // the result will be the input for a PHI. In this case the cost will
- // be extremely high for fixed-width vectors.
- // NOTE: getScalarizationOverhead returns a cost that's far too
- // pessimistic for the actual generated codegen. In reality there are
- // two instructions generated per lane.
- return RetTy->getNumElements() * 2;
+    auto *RetTy = cast<VectorType>(ICA.getReturnType());
+ EVT RetVT = getTLI()->getValueType(DL, RetTy);
+ EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
+ break;
+
+ if (RetTy->isScalableTy()) {
+ if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
+ TargetLowering::TypeSplitVector)
+ break;
+
+ auto LT = getTypeLegalizationCost(RetTy);
+ InstructionCost Cost = LT.first;
+ // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
+ // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
+ // nxv32i1 = get_active_lane_mask(base, idx) ->
+ // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
+ if (ST->hasSVE2p1() || ST->hasSME2()) {
+ Cost /= 2;
+ if (Cost == 1)
+ return Cost;
}
+
+      // If more than one whilelo intrinsic is required, include the extra
+      // cost of the saturating add & select needed to increment the start
+      // value after the first intrinsic call.
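+      // E.g. with Cost == 4 the mask needs four whilelo calls and three
+      // uadd_sat/select pairs to advance the start value, so the total
+      // returned below is Cost + SplitCost * (Cost - 1).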
+ Type *OpTy = ICA.getArgTypes()[0];
+ IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
+ InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
+ Type *CondTy = OpTy->getWithNewBitWidth(1);
+ SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
+ CmpInst::ICMP_UGT, CostKind);
+ return Cost + (SplitCost * (Cost - 1));
+ } else if (!getTLI()->isTypeLegal(RetVT)) {
+ // We don't have enough context at this point to determine if the mask
+ // is going to be kept live after the block, which will force the vXi1
+ // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
+ // For now, we just assume the vectorizer created this intrinsic and
+ // the result will be the input for a PHI. In this case the cost will
+ // be extremely high for fixed-width vectors.
+ // NOTE: getScalarizationOverhead returns a cost that's far too
+ // pessimistic for the actual generated codegen. In reality there are
+ // two instructions generated per lane.
+ return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
}
break;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3e55b76..14b0f9a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -5126,23 +5126,13 @@ bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
- const LLT Src1Ty = MRI.getType(Src1Reg);
Register Src2Reg = I.getOperand(2).getReg();
- const LLT Src2Ty = MRI.getType(Src2Reg);
ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
LLVMContext &Ctx = MF.getFunction().getContext();
- // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
- // it's originated from a <1 x T> type. Those should have been lowered into
- // G_BUILD_VECTOR earlier.
- if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
- LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
- return false;
- }
-
unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
SmallVector<Constant *, 64> CstIdxs;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 05a4313..5f93847 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1201,25 +1201,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
return llvm::is_contained(
{v8s8, v16s8, v4s16, v8s16, v2s32, v4s32, v2s64}, DstTy);
})
- // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors) or scalar
- // destinations, we just want those lowered into G_BUILD_VECTOR or
- // G_EXTRACT_ELEMENT.
- .lowerIf([=](const LegalityQuery &Query) {
- return !Query.Types[0].isVector() || !Query.Types[1].isVector();
- })
.moreElementsIf(
[](const LegalityQuery &Query) {
- return Query.Types[0].isVector() && Query.Types[1].isVector() &&
- Query.Types[0].getNumElements() >
- Query.Types[1].getNumElements();
+ return Query.Types[0].getNumElements() >
+ Query.Types[1].getNumElements();
},
changeTo(1, 0))
.moreElementsToNextPow2(0)
.moreElementsIf(
[](const LegalityQuery &Query) {
- return Query.Types[0].isVector() && Query.Types[1].isVector() &&
- Query.Types[0].getNumElements() <
- Query.Types[1].getNumElements();
+ return Query.Types[0].getNumElements() <
+ Query.Types[1].getNumElements();
},
changeTo(0, 1))
.widenScalarOrEltToNextPow2OrMinSize(0, 8)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 830a35bb..6d2d705 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -856,7 +856,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case TargetOpcode::G_FPTOSI_SAT:
- case TargetOpcode::G_FPTOUI_SAT: {
+ case TargetOpcode::G_FPTOUI_SAT:
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
if (DstType.isVector())
break;
@@ -864,11 +866,19 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
break;
}
- OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
+ TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, TRI);
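+    // Equal-width conversions (e.g. f32->i32) always have an FPR->FPR form;
+    // mixed-width FPR->FPR forms additionally require FEAT_FPRCVT. Keep the
+    // result on FPR only when every non-debug user prefers or requires FP.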
+ if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) &&
+ all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](const MachineInstr &UseMI) {
+ return onlyUsesFP(UseMI, MRI, TRI) ||
+ prefersFPUse(UseMI, MRI, TRI);
+ }))
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ else
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
}
- case TargetOpcode::G_FPTOSI:
- case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_INTRINSIC_LRINT:
case TargetOpcode::G_INTRINSIC_LLRINT:
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index d71f728..085c8588 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -75,8 +75,8 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
}
void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName,
- const AArch64TargetLowering &TLI) {
- RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName);
+ const RTLIB::RuntimeLibcallsInfo &RTLCI) {
+ RTLIB::LibcallImpl Impl = RTLCI.getSupportedLibcallImpl(FuncName);
if (Impl == RTLIB::Unsupported)
return;
unsigned KnownAttrs = SMEAttrs::Normal;
@@ -124,21 +124,22 @@ bool SMECallAttrs::requiresSMChange() const {
return true;
}
-SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI)
+SMECallAttrs::SMECallAttrs(const CallBase &CB,
+ const RTLIB::RuntimeLibcallsInfo *RTLCI)
: CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal),
Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) {
if (auto *CalledFunction = CB.getCalledFunction())
- CalledFn = SMEAttrs(*CalledFunction, TLI);
-
- // An `invoke` of an agnostic ZA function may not return normally (it may
- // resume in an exception block). In this case, it acts like a private ZA
- // callee and may require a ZA save to be set up before it is called.
- if (isa<InvokeInst>(CB))
- CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
+ CalledFn = SMEAttrs(*CalledFunction, RTLCI);
// FIXME: We probably should not allow SME attributes on direct calls but
// clang duplicates streaming mode attributes at each callsite.
assert((IsIndirect ||
((Callsite.withoutPerCallsiteFlags() | CalledFn) == CalledFn)) &&
"SME attributes at callsite do not match declaration");
+
+ // An `invoke` of an agnostic ZA function may not return normally (it may
+ // resume in an exception block). In this case, it acts like a private ZA
+ // callee and may require a ZA save to be set up before it is called.
+ if (isa<InvokeInst>(CB))
+ CalledFn.set(SMEAttrs::ZA_State_Agnostic, /*Enable=*/false);
}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index d26e3cd..28c397e 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -12,8 +12,9 @@
#include "llvm/IR/Function.h"
namespace llvm {
-
-class AArch64TargetLowering;
+namespace RTLIB {
+struct RuntimeLibcallsInfo;
+}
class Function;
class CallBase;
@@ -52,14 +53,14 @@ public:
SMEAttrs() = default;
SMEAttrs(unsigned Mask) { set(Mask); }
- SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr)
+ SMEAttrs(const Function &F, const RTLIB::RuntimeLibcallsInfo *RTLCI = nullptr)
: SMEAttrs(F.getAttributes()) {
- if (TLI)
- addKnownFunctionAttrs(F.getName(), *TLI);
+ if (RTLCI)
+ addKnownFunctionAttrs(F.getName(), *RTLCI);
}
SMEAttrs(const AttributeList &L);
- SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) {
- addKnownFunctionAttrs(FuncName, TLI);
+ SMEAttrs(StringRef FuncName, const RTLIB::RuntimeLibcallsInfo &RTLCI) {
+ addKnownFunctionAttrs(FuncName, RTLCI);
};
void set(unsigned M, bool Enable = true) {
@@ -157,7 +158,7 @@ public:
private:
void addKnownFunctionAttrs(StringRef FuncName,
- const AArch64TargetLowering &TLI);
+ const RTLIB::RuntimeLibcallsInfo &RTLCI);
void validate() const;
};
@@ -175,7 +176,7 @@ public:
SMEAttrs Callsite = SMEAttrs::Normal)
: CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {}
- SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI);
+ SMECallAttrs(const CallBase &CB, const RTLIB::RuntimeLibcallsInfo *RTLCI);
SMEAttrs &caller() { return CallerFn; }
SMEAttrs &callee() { return IsIndirect ? Callsite : CalledFn; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e8b211f..7f00ead 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[
combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
]>;
+// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x))
+def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32),
+ [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)),
+ (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>;
+
+def combine_or_s64_s32 : GICombineRule<
+ (defs root:$dst),
+ (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst),
+ (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x),
+ (G_OR $or, $x_lo, $y),
+ (G_MERGE_VALUES $dst, $or, $x_hi))>;
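+// The zext guarantees the high 32 bits of that operand are zero, so only the
+// low halves need an OR; the high half of x is forwarded unchanged.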
+
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
// saves one instruction compared to the promotion.
@@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
foldable_fneg, combine_shuffle_vector_to_build_vector,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
@@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
[all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
- binop_s64_with_s32_mask_combines]> {
+ binop_s64_with_s32_mask_combines, combine_or_s64_s32]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 596a895..1a13b22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
}
+ auto &MinNumMaxNumIeee =
+ getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+
+ if (ST.hasVOP3PInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypesPK16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampMaxNumElements(0, S16, 2)
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
+ } else if (ST.has16BitInsts()) {
+ MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
+ } else {
+ MinNumMaxNumIeee.legalFor(FPTypesBase)
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder(
- {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
- G_FMAXNUM_IEEE});
+ {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
@@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
+ } else if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lowerFor({V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .lower();
} else {
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
}
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
@@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
- case TargetOpcode::G_FMINNUM_IEEE:
- case TargetOpcode::G_FMAXNUM_IEEE:
return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
@@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
- MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
-
- // With ieee_mode disabled, the instructions have the correct behavior
- // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
- //
- // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
- // enabled.
- if (!MFI->getMode().IEEE) {
- if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
- MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
- return true;
-
- return !IsIEEEOp;
- }
-
- if (IsIEEEOp)
+ // With ieee_mode disabled, the instructions have the correct behavior.
+ if (!MFI->getMode().IEEE)
return true;
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8122db2..313ae3d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21381,15 +21381,6 @@ void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
TargetLowering::insertSSPDeclarations(M);
}
-Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
- // MSVC CRT has a function to validate security cookie.
- RTLIB::LibcallImpl SecurityCheckCookie =
- getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
- if (SecurityCheckCookie != RTLIB::Unsupported)
- return M.getFunction(getLibcallImplName(SecurityCheckCookie));
- return TargetLowering::getSSPStackGuardCheck(M);
-}
-
bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const {
// If we do not have NEON, vector types are not natively supported.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 8c5e0cf..357d2c5 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -708,7 +708,6 @@ class VectorType;
bool useLoadStackGuardNode(const Module &M) const override;
void insertSSPDeclarations(Module &M) const override;
- Function *getSSPStackGuardCheck(const Module &M) const override;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 53be167..10d4cd5 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -6546,23 +6546,25 @@ def KCFI_CHECK_ARM
: PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
Sched<[]>,
Requires<[IsARM]> {
- let Size = 28; // 7 instructions (bic, ldr, 4x eor, beq, udf)
+ let Size = 40; // worst-case 10 instructions @ 4 bytes each
+ // (push, bic, ldr, 4x eor, pop, beq, udf)
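+                 // The larger estimate presumably accounts for the expansion
+                 // possibly spilling a scratch register (the push/pop pair
+                 // above), which the previous 28-byte bound did not include.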
}
def KCFI_CHECK_Thumb2
: PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
Sched<[]>,
Requires<[IsThumb2]> {
- let Size =
- 32; // worst-case 9 instructions (push, bic, ldr, 4x eor, pop, beq.w, udf)
+ let Size = 34; // worst-case (push.w[2], bic[4], ldr[4], 4x eor[16], pop.w[2],
+ // beq.w[4], udf[2])
}
def KCFI_CHECK_Thumb1
: PseudoInst<(outs), (ins GPR:$ptr, i32imm:$type), NoItinerary, []>,
Sched<[]>,
Requires<[IsThumb1Only]> {
- let Size = 50; // worst-case 25 instructions (pushes, bic helper, type
- // building, cmp, pops)
+ let Size = 38; // worst-case 19 instructions @ 2 bytes each
+ // (2x push, 3x bic-helper, subs+ldr, 13x type-building, cmp,
+ // 2x pop, beq, bkpt)
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 42e90f0..d6fa65f 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
///
-/// \file This file contains pases and utilities to convert a modern LLVM
+/// \file This file contains passes and utilities to convert a modern LLVM
/// module into a module compatible with the LLVM 3.7-based DirectX Intermediate
/// Language (DXIL).
//===----------------------------------------------------------------------===//
@@ -16,7 +16,6 @@
#include "DirectX.h"
#include "DirectXIRPasses/PointerTypeAnalysis.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
#include "llvm/Analysis/DXILResource.h"
@@ -27,7 +26,6 @@
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/VersionTuple.h"
#define DEBUG_TYPE "dxil-prepare"
@@ -116,31 +114,6 @@ static void removeStringFunctionAttributes(Function &F,
F.removeRetAttrs(DeadAttrs);
}
-static void cleanModuleFlags(Module &M) {
- NamedMDNode *MDFlags = M.getModuleFlagsMetadata();
- if (!MDFlags)
- return;
-
- SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries;
- M.getModuleFlagsMetadata(FlagEntries);
- bool Updated = false;
- for (auto &Flag : FlagEntries) {
- // llvm 3.7 only supports behavior up to AppendUnique.
- if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique)
- continue;
- Flag.Behavior = Module::ModFlagBehavior::Warning;
- Updated = true;
- }
-
- if (!Updated)
- return;
-
- MDFlags->eraseFromParent();
-
- for (auto &Flag : FlagEntries)
- M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val);
-}
-
class DXILPrepareModule : public ModulePass {
static Value *maybeGenerateBitcast(IRBuilder<> &Builder,
@@ -202,15 +175,6 @@ class DXILPrepareModule : public ModulePass {
Builder.getPtrTy(PtrTy->getAddressSpace())));
}
- static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) {
- return {M.getMDKindID("dx.nonuniform"),
- M.getMDKindID("dx.controlflow.hints"),
- M.getMDKindID("dx.precise"),
- llvm::LLVMContext::MD_range,
- llvm::LLVMContext::MD_alias_scope,
- llvm::LLVMContext::MD_noalias};
- }
-
public:
bool runOnModule(Module &M) override {
PointerTypeMap PointerTypes = PointerTypeAnalysis::run(M);
@@ -224,10 +188,7 @@ public:
const dxil::ModuleMetadataInfo MetadataInfo =
getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
VersionTuple ValVer = MetadataInfo.ValidatorVersion;
- bool SkipValidation = ValVer.getMajor() == 0 && ValVer.getMinor() == 0;
-
- // construct allowlist of valid metadata node kinds
- std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M);
+ bool AllowExperimental = ValVer.getMajor() == 0 && ValVer.getMinor() == 0;
for (auto &F : M.functions()) {
F.removeFnAttrs(AttrMask);
@@ -235,7 +196,7 @@ public:
      // Only remove string attributes if we are not skipping validation.
      // This preserves the experimental attributes when the validation
      // version is 0.0 for experiment mode.
- removeStringFunctionAttributes(F, SkipValidation);
+ removeStringFunctionAttributes(F, AllowExperimental);
for (size_t Idx = 0, End = F.arg_size(); Idx < End; ++Idx)
F.removeParamAttrs(Idx, AttrMask);
@@ -243,11 +204,17 @@ public:
IRBuilder<> Builder(&BB);
for (auto &I : make_early_inc_range(BB)) {
- I.dropUnknownNonDebugMetadata(DXILCompatibleMDs);
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ CB->removeFnAttrs(AttrMask);
+ CB->removeRetAttrs(AttrMask);
+ for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx)
+ CB->removeParamAttrs(Idx, AttrMask);
+ continue;
+ }
        // Emitting NoOp bitcast instructions allows the ValueEnumerator to be
        // unmodified as it reserves instruction IDs during construction.
- if (auto LI = dyn_cast<LoadInst>(&I)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, LI->getPointerOperand(),
LI->getType())) {
@@ -257,7 +224,7 @@ public:
}
continue;
}
- if (auto SI = dyn_cast<StoreInst>(&I)) {
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, SI->getPointerOperand(),
SI->getValueOperand()->getType())) {
@@ -268,39 +235,16 @@ public:
}
continue;
}
- if (auto GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
if (Value *NoOpBitcast = maybeGenerateBitcast(
Builder, PointerTypes, I, GEP->getPointerOperand(),
GEP->getSourceElementType()))
GEP->setOperand(0, NoOpBitcast);
continue;
}
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- CB->removeFnAttrs(AttrMask);
- CB->removeRetAttrs(AttrMask);
- for (size_t Idx = 0, End = CB->arg_size(); Idx < End; ++Idx)
- CB->removeParamAttrs(Idx, AttrMask);
- continue;
- }
}
}
}
- // Remove flags not for DXIL.
- cleanModuleFlags(M);
-
- // dx.rootsignatures will have been parsed from its metadata form as its
- // binary form as part of the RootSignatureAnalysisWrapper, so safely
- // remove it as it is not recognized in DXIL
- if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures"))
- RootSignature->eraseFromParent();
-
- // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and
- // causes all tests using the DXIL Validator to fail.
- //
- // This is a temporary fix and should be replaced with a whitelist once
- // we have determined all metadata that the DXIL Validator allows
- if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa"))
- ErrNo->eraseFromParent();
return true;
}
@@ -308,11 +252,11 @@ public:
DXILPrepareModule() : ModulePass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DXILMetadataAnalysisWrapperPass>();
- AU.addRequired<RootSignatureAnalysisWrapper>();
- AU.addPreserved<RootSignatureAnalysisWrapper>();
- AU.addPreserved<ShaderFlagsAnalysisWrapper>();
+
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
AU.addPreserved<DXILResourceWrapperPass>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
+ AU.addPreserved<ShaderFlagsAnalysisWrapper>();
}
static char ID; // Pass identification.
};
@@ -323,7 +267,6 @@ char DXILPrepareModule::ID = 0;
INITIALIZE_PASS_BEGIN(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module",
false, false)
INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
INITIALIZE_PASS_END(DXILPrepareModule, DEBUG_TYPE, "DXIL Prepare Module", false,
false)
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 9eebcc9..1e4797b 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -7,8 +7,10 @@
//===----------------------------------------------------------------------===//
#include "DXILTranslateMetadata.h"
+#include "DXILRootSignature.h"
#include "DXILShaderFlags.h"
#include "DirectX.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/DXILMetadataAnalysis.h"
@@ -204,9 +206,9 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
return MDNode::get(Ctx, MDVals);
}
-MDTuple *constructEntryMetadata(const Function *EntryFn, MDTuple *Signatures,
- MDNode *Resources, MDTuple *Properties,
- LLVMContext &Ctx) {
+static MDTuple *constructEntryMetadata(const Function *EntryFn,
+ MDTuple *Signatures, MDNode *Resources,
+ MDTuple *Properties, LLVMContext &Ctx) {
// Each entry point metadata record specifies:
// * reference to the entry point function global symbol
// * unmangled name
@@ -290,42 +292,82 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD,
return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx);
}
-// TODO: We might need to refactor this to be more generic,
-// in case we need more metadata to be replaced.
-static void translateBranchMetadata(Module &M) {
- for (Function &F : M) {
- for (BasicBlock &BB : F) {
- Instruction *BBTerminatorInst = BB.getTerminator();
+static void translateBranchMetadata(Module &M, Instruction *BBTerminatorInst) {
+ MDNode *HlslControlFlowMD =
+ BBTerminatorInst->getMetadata("hlsl.controlflow.hint");
+
+ if (!HlslControlFlowMD)
+ return;
- MDNode *HlslControlFlowMD =
- BBTerminatorInst->getMetadata("hlsl.controlflow.hint");
+ assert(HlslControlFlowMD->getNumOperands() == 2 &&
+ "invalid operands for hlsl.controlflow.hint");
- if (!HlslControlFlowMD)
- continue;
+ MDBuilder MDHelper(M.getContext());
- assert(HlslControlFlowMD->getNumOperands() == 2 &&
- "invalid operands for hlsl.controlflow.hint");
+ llvm::Metadata *HintsStr = MDHelper.createString("dx.controlflow.hints");
+ llvm::Metadata *HintsValue = MDHelper.createConstant(
+ mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1)));
- MDBuilder MDHelper(M.getContext());
- ConstantInt *Op1 =
- mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1));
+  MDNode *HintsNode = llvm::MDNode::get(M.getContext(), {HintsStr, HintsValue});
- SmallVector<llvm::Metadata *, 2> Vals(
- ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"),
- MDHelper.createConstant(Op1)});
+  BBTerminatorInst->setMetadata("dx.controlflow.hints", HintsNode);
+ BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr);
+}
+
+static std::array<unsigned, 6> getCompatibleInstructionMDs(llvm::Module &M) {
+ return {
+ M.getMDKindID("dx.nonuniform"), M.getMDKindID("dx.controlflow.hints"),
+ M.getMDKindID("dx.precise"), llvm::LLVMContext::MD_range,
+ llvm::LLVMContext::MD_alias_scope, llvm::LLVMContext::MD_noalias};
+}
- MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals);
+static void translateInstructionMetadata(Module &M) {
+  // Construct an allowlist of valid metadata node kinds.
+ std::array<unsigned, 6> DXILCompatibleMDs = getCompatibleInstructionMDs(M);
- BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode);
- BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr);
+ for (Function &F : M) {
+ for (BasicBlock &BB : F) {
+      // This needs to be done first so that "hlsl.controlflow.hint" isn't
+      // removed by the allowlist filtering below.
+ if (auto *I = BB.getTerminator())
+ translateBranchMetadata(M, I);
+
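+      // dropUnknownNonDebugMetadata strips every metadata kind that is not
+      // in DXILCompatibleMDs; debug info metadata is left untouched.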
+ for (auto &I : make_early_inc_range(BB)) {
+ I.dropUnknownNonDebugMetadata(DXILCompatibleMDs);
+ }
}
}
}
-static void translateMetadata(Module &M, DXILResourceMap &DRM,
- DXILResourceTypeMap &DRTM,
- const ModuleShaderFlags &ShaderFlags,
- const ModuleMetadataInfo &MMDI) {
+static void cleanModuleFlags(Module &M) {
+ NamedMDNode *MDFlags = M.getModuleFlagsMetadata();
+ if (!MDFlags)
+ return;
+
+ SmallVector<llvm::Module::ModuleFlagEntry> FlagEntries;
+ M.getModuleFlagsMetadata(FlagEntries);
+ bool Updated = false;
+ for (auto &Flag : FlagEntries) {
+ // llvm 3.7 only supports behavior up to AppendUnique.
+ if (Flag.Behavior <= Module::ModFlagBehavior::AppendUnique)
+ continue;
+ Flag.Behavior = Module::ModFlagBehavior::Warning;
+ Updated = true;
+ }
+
+ if (!Updated)
+ return;
+
+ MDFlags->eraseFromParent();
+
+ for (auto &Flag : FlagEntries)
+ M.addModuleFlag(Flag.Behavior, Flag.Key->getString(), Flag.Val);
+}
+
+static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
+ DXILResourceTypeMap &DRTM,
+ const ModuleShaderFlags &ShaderFlags,
+ const ModuleMetadataInfo &MMDI) {
LLVMContext &Ctx = M.getContext();
IRBuilder<> IRB(Ctx);
SmallVector<MDNode *> EntryFnMDNodes;
@@ -381,6 +423,22 @@ static void translateMetadata(Module &M, DXILResourceMap &DRM,
M.getOrInsertNamedMetadata("dx.entryPoints");
for (auto *Entry : EntryFnMDNodes)
EntryPointsNamedMD->addOperand(Entry);
+
+ cleanModuleFlags(M);
+
+  // dx.rootsignatures will already have been parsed from its metadata form
+  // into its binary form by the RootSignatureAnalysisWrapper, so it can
+  // safely be removed here, as it is not recognized in DXIL.
+ if (NamedMDNode *RootSignature = M.getNamedMetadata("dx.rootsignatures"))
+ RootSignature->eraseFromParent();
+
+ // llvm.errno.tbaa was recently added but is not supported in LLVM 3.7 and
+ // causes all tests using the DXIL Validator to fail.
+ //
+  // This is a temporary fix and should be replaced with an allowlist once
+ // we have determined all metadata that the DXIL Validator allows
+ if (NamedMDNode *ErrNo = M.getNamedMetadata("llvm.errno.tbaa"))
+ ErrNo->eraseFromParent();
}
PreservedAnalyses DXILTranslateMetadata::run(Module &M,
@@ -390,8 +448,8 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,
const ModuleShaderFlags &ShaderFlags = MAM.getResult<ShaderFlagsAnalysis>(M);
const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M);
- translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
- translateBranchMetadata(M);
+ translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
+ translateInstructionMetadata(M);
return PreservedAnalyses::all();
}
@@ -409,10 +467,13 @@ public:
AU.addRequired<DXILResourceWrapperPass>();
AU.addRequired<ShaderFlagsAnalysisWrapper>();
AU.addRequired<DXILMetadataAnalysisWrapperPass>();
- AU.addPreserved<DXILResourceWrapperPass>();
+ AU.addRequired<RootSignatureAnalysisWrapper>();
+
AU.addPreserved<DXILMetadataAnalysisWrapperPass>();
- AU.addPreserved<ShaderFlagsAnalysisWrapper>();
AU.addPreserved<DXILResourceBindingWrapperPass>();
+ AU.addPreserved<DXILResourceWrapperPass>();
+ AU.addPreserved<RootSignatureAnalysisWrapper>();
+ AU.addPreserved<ShaderFlagsAnalysisWrapper>();
}
bool runOnModule(Module &M) override {
@@ -425,8 +486,8 @@ public:
dxil::ModuleMetadataInfo MMDI =
getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
- translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
- translateBranchMetadata(M);
+ translateGlobalMetadata(M, DRM, DRTM, ShaderFlags, MMDI);
+ translateInstructionMetadata(M);
return true;
}
};
@@ -443,6 +504,7 @@ INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata",
"DXIL Translate Metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper)
+INITIALIZE_PASS_DEPENDENCY(RootSignatureAnalysisWrapper)
INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass)
INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata",
"DXIL Translate Metadata", false, false)
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
index f3f5eb1..4c1ffac 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.h
@@ -13,7 +13,8 @@
namespace llvm {
-/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes
+/// A pass that transforms LLVM metadata in the module to its DXIL equivalent,
+/// then emits all recognized DXIL metadata.
class DXILTranslateMetadata : public PassInfoMixin<DXILTranslateMetadata> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index fb0928b8..ede8463 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -79,6 +79,12 @@ def ExtensionHVXV79: SubtargetFeature<"hvxv79", "HexagonHVXVersion",
ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
ExtensionHVXV73, ExtensionHVXV75]>;
+def ExtensionHVXV81: SubtargetFeature<"hvxv81", "HexagonHVXVersion",
+ "Hexagon::ArchEnum::V81", "Hexagon HVX instructions",
+ [ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67,
+ ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
+ ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79]>;
+
def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
"true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps",
@@ -151,6 +157,8 @@ def UseHVXV75 : Predicate<"HST->useHVXV75Ops()">,
AssemblerPredicate<(all_of ExtensionHVXV75)>;
def UseHVXV79 : Predicate<"HST->useHVXV79Ops()">,
AssemblerPredicate<(all_of ExtensionHVXV79)>;
+def UseHVXV81 : Predicate<"HST->useHVXV81Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV81)>;
def UseAudio : Predicate<"HST->useAudioOps()">,
AssemblerPredicate<(all_of ExtensionAudio)>;
def UseZReg : Predicate<"HST->useZRegOps()">,
@@ -488,6 +496,11 @@ def : Proc<"hexagonv79", HexagonModelV79,
ArchV68, ArchV69, ArchV71, ArchV73, ArchV75, ArchV79,
FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
+def : Proc<"hexagonv81", HexagonModelV81,
+ [ArchV65, ArchV66, ArchV67, ArchV68, ArchV69, ArchV71, ArchV73,
+ ArchV75, ArchV79, ArchV81,
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
// Need to update the correct features for tiny core.
// Disable NewValueJumps since the packetizer is unable to handle a packet with
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h
index 8984534..9bf4034 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -29,7 +29,8 @@ enum class ArchEnum {
V71,
V73,
V75,
- V79
+ V79,
+ V81
};
inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {
@@ -50,6 +51,7 @@ inline std::optional<Hexagon::ArchEnum> getCpu(StringRef CPU) {
.Case("hexagonv73", Hexagon::ArchEnum::V73)
.Case("hexagonv75", Hexagon::ArchEnum::V75)
.Case("hexagonv79", Hexagon::ArchEnum::V79)
+ .Case("hexagonv81", Hexagon::ArchEnum::V81)
.Default(std::nullopt);
}
} // namespace Hexagon
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td
index 8ec1d93..f623fd0 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -34,3 +34,5 @@ def ArchV75: SubtargetFeature<"v75", "HexagonArchVersion", "Hexagon::ArchEnum::V
def HasV75 : Predicate<"HST->hasV75Ops()">, AssemblerPredicate<(all_of ArchV75)>;
def ArchV79: SubtargetFeature<"v79", "HexagonArchVersion", "Hexagon::ArchEnum::V79", "Enable Hexagon V79 architecture">;
def HasV79 : Predicate<"HST->hasV79Ops()">, AssemblerPredicate<(all_of ArchV79)>;
+def ArchV81: SubtargetFeature<"v81", "HexagonArchVersion", "Hexagon::ArchEnum::V81", "Enable Hexagon V81 architecture">;
+def HasV81 : Predicate<"HST->hasV81Ops()">, AssemblerPredicate<(all_of ArchV81)>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index 93696e0..f4e36fa7 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -7222,3 +7222,595 @@ class DepHVXItinV79 {
[Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
+
+class DepHVXItinV81 {
+ list<InstrItinData> DepHVXItinV81_list = [
+ InstrItinData <tc_0390c1ca, /*SLOT01,LOAD,VA,VX_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_227864f7, /*SLOT0,STORE,VA,VX_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_37820f4c, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_4942646a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_531b383c, /*SLOT0123*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_649072c2, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7177e272, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_72e2b393, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_73efe966, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8772086c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9a1cab75, /*SLOT01,LOAD,VA,VX_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9aff7a2a, /*SLOT0,STORE,VA,VX_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE], 0>,
+ InstrStage<1, [CVI_MPY01]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a19b9305, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c4edf264, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cda936da, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_dcca380f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2fdd6e6, /*SLOT0123*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f175e046, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>
+ ];
+}
\ No newline at end of file
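
Each InstrItinData entry above ties a scheduling class (tc_*) to the functional units it reserves and to a list of operand cycles: the first number is the cycle at which the result is defined, and the rest are the cycles at which the inputs are read. As a rough sketch of how those cycle lists turn into producer-to-consumer latency (a toy model, not the LLVM API; real queries go through InstrItineraryData::getOperandLatency), consider:

#include <algorithm>
#include <vector>

// One itinerary entry reduced to its operand-cycle list, e.g. {9, 7, 5, 2}:
// the result is defined at cycle 9; the inputs are read at cycles 7, 5, 2.
struct ItinEntry {
  std::vector<int> OperandCycles;
};

// Toy producer-to-consumer latency: a def at cycle 9 feeding a use that is
// read at cycle 7 costs 2 cycles; never negative.
int operandLatency(const ItinEntry &Prod, unsigned DefIdx,
                   const ItinEntry &Cons, unsigned UseIdx) {
  int DefCycle = Prod.OperandCycles[DefIdx];
  int UseCycle = Cons.OperandCycles[UseIdx];
  return std::max(DefCycle - UseCycle, 0);
}

Under this toy model, an HVX op defining its result at cycle 9 that feeds a consumer reading at cycle 7 shows an effective latency of 2 cycles; the Hex_FWD and HVX_FWD tags name the forwarding (bypass) paths attached to those operands.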
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
index 7a1ad3e..48b665c 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -13740,3 +13740,891 @@ class DepScalarItinV79 {
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
];
}
+
+class DepScalarItinV81 {
+ list<InstrItinData> DepScalarItinV81_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a43be35, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_139ef484, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_2471c1c8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_33e7e673, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_46c18ecf, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4bf903b0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_512b1653, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54f0cee2, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a222e89, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6aa823ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb52018, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_759e57be, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d6a2568, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_78f87ed3, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_7af3a37e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c28bd7e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f58404a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8f36a2fd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95f43c5e, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9b20a062, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a724463d, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af6af259, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b2196a3f, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b3d46584, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_b9bec29e, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb78483e, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d234b61a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d71ea8fa, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e60def48, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed3f8d2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f97707c1, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
\ No newline at end of file
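
The scalar entries follow the same shape. In an InstrStage<N, [units], K>, N is how many cycles the stage holds one of the listed units, and a trailing K of 0 means the next stage reserves its unit in the same cycle, which is how parallel unit reservations are chained. A toy C++ sketch of that single-cycle reservation check (illustrative only; LLVM compiles these tables into a DFA for the packetizer):

#include <vector>

// InstrStage<N, [units], K> reduced to a toy form: hold one of Units for
// Cycles cycles, then advance NextCycles before the next stage starts
// (0 means the next stage reserves its unit in the same cycle).
struct Stage {
  unsigned Cycles;
  std::vector<unsigned> Units; // candidate unit ids, e.g. SLOT0..SLOT3
  unsigned NextCycles;
};

// Single-cycle toy check: every stage must claim some free unit now.
// Busy[u] marks units already reserved in the current cycle.
bool canIssue(const std::vector<Stage> &Stages, std::vector<bool> &Busy) {
  for (const Stage &S : Stages) {
    bool Claimed = false;
    for (unsigned U : S.Units) {
      if (!Busy[U]) {
        Busy[U] = true;
        Claimed = true;
        break;
      }
    }
    if (!Claimed)
      return false;
  }
  return true;
}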
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index ae96753..f8f1c2a 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -39178,6 +39178,19 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vsub_hf_mix : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.qf16)",
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vsub_qf16 : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -39269,6 +39282,19 @@ let opNewValue = 0;
let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
+def V6_vsub_sf_mix : HInst<
+(outs HvxVR:$Vd32),
+(ins HvxVR:$Vu32, HvxVR:$Vv32),
+"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.qf32)",
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV81,UseHVXQFloat]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isCVI = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
def V6_vsub_sf_sf : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -41116,6 +41142,17 @@ let hasNewValue = 1;
let opNewValue = 0;
let isSolo = 1;
}
+def Y2_tlbpp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = tlbp($Rss32)",
+tc_6aa823ab, TypeCR>, Enc_90cd8b, Requires<[HasV81]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+}
def Y2_tlbr : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 17cb96c..23f4b3a 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -3827,3 +3827,14 @@ def: Pat<(int_hexagon_V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2),
(V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsub_hf_f8_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsub_hf_f8 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV79, UseHVX128B]>;
+
+// V81 HVX Instructions.
+
+def: Pat<(int_hexagon_V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_hf_mix_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_hf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX64B, UseHVXQFloat]>;
+def: Pat<(int_hexagon_V6_vsub_sf_mix_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsub_sf_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[UseHVXV81, UseHVX128B, UseHVXQFloat]>;
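
These patterns let instruction selection match the new *_mix intrinsics (and their _128B variants) directly onto V6_vsub_hf_mix and V6_vsub_sf_mix. A hedged sketch of emitting one of the intrinsics from C++ with IRBuilder follows; emitVSubSfMix is an illustrative helper name, Va and Vb are assumed to already hold HVX vectors of the types the intrinsic expects, and on older trees the lookup call is Intrinsic::getDeclaration rather than Intrinsic::getOrInsertDeclaration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Module.h"

// Illustrative helper (not an LLVM API): emit a call to the V81
// mixed-precision subtract intrinsic.
llvm::Value *emitVSubSfMix(llvm::IRBuilder<> &B, llvm::Module *M,
                           llvm::Value *Va, llvm::Value *Vb) {
  llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
      M, llvm::Intrinsic::hexagon_V6_vsub_sf_mix);
  return B.CreateCall(F, {Va, Vb});
}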
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index e285e04..7ee280d 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -654,7 +654,9 @@ void HexagonDAGToDAGISel::SelectIntrinsicWChain(SDNode *N) {
IntNo == Intrinsic::hexagon_V6_vgathermh ||
IntNo == Intrinsic::hexagon_V6_vgathermh_128B ||
IntNo == Intrinsic::hexagon_V6_vgathermhw ||
- IntNo == Intrinsic::hexagon_V6_vgathermhw_128B) {
+ IntNo == Intrinsic::hexagon_V6_vgathermhw_128B ||
+ IntNo == Intrinsic::hexagon_V6_vgather_vscattermh ||
+ IntNo == Intrinsic::hexagon_V6_vgather_vscattermh_128B) {
SelectV65Gather(N);
return;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index c7a4f68..3cc146b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -2953,6 +2953,10 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
case Intrinsic::hexagon_V6_vgathermhw_128B:
Opcode = Hexagon::V6_vgathermhw_pseudo;
break;
+ case Intrinsic::hexagon_V6_vgather_vscattermh:
+ case Intrinsic::hexagon_V6_vgather_vscattermh_128B:
+ Opcode = Hexagon::V6_vgather_vscatter_mh_pseudo;
+ break;
}
SDVTList VTs = CurDAG->getVTList(MVT::Other);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 9f7f434..526b4de 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2145,7 +2145,9 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::hexagon_V6_vgathermhq:
case Intrinsic::hexagon_V6_vgathermhq_128B:
case Intrinsic::hexagon_V6_vgathermhwq:
- case Intrinsic::hexagon_V6_vgathermhwq_128B: {
+ case Intrinsic::hexagon_V6_vgathermhwq_128B:
+ case Intrinsic::hexagon_V6_vgather_vscattermh:
+ case Intrinsic::hexagon_V6_vgather_vscattermh_128B: {
const Module &M = *I.getParent()->getParent()->getParent();
Info.opc = ISD::INTRINSIC_W_CHAIN;
Type *VecTy = I.getArgOperand(1)->getType();
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 939841a..47726d6 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1554,80 +1554,93 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {
MachineBasicBlock::iterator First;
switch (Opc) {
- case Hexagon::V6_vgathermh_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermw_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhw_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermwq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
-
- case Hexagon::V6_vgathermhwq_pseudo:
- First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
- .add(MI.getOperand(2))
- .add(MI.getOperand(3))
- .add(MI.getOperand(4))
- .add(MI.getOperand(5));
- BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
- .add(MI.getOperand(0))
- .addImm(MI.getOperand(1).getImm())
- .addReg(Hexagon::VTMP);
- MBB.erase(MI);
- return First.getInstrIterator();
+ case Hexagon::V6_vgather_vscatter_mh_pseudo:
+    // This is mainly a placeholder; it will be extended.
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vscattermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+ case Hexagon::V6_vgathermh_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermw_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhw_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermwq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
+
+ case Hexagon::V6_vgathermhwq_pseudo:
+ First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
+ BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
+ .add(MI.getOperand(0))
+ .addImm(MI.getOperand(1).getImm())
+ .addReg(Hexagon::VTMP);
+ MBB.erase(MI);
+ return First.getInstrIterator();
}
return MI.getIterator();
@@ -2806,6 +2819,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vL32b_nt_tmp_npred_ai:
case Hexagon::V6_vS32Ub_npred_ai:
case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgather_vscatter_mh_pseudo:
case Hexagon::V6_vgathermw_pseudo:
case Hexagon::V6_vgathermhw_pseudo:
case Hexagon::V6_vgathermhq_pseudo:
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
index f927f9b..42393d0 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
@@ -40,6 +40,19 @@ defm V6_vgathermh_pseudo : vgathermh<HvxVR>;
defm V6_vgathermw_pseudo : vgathermw<HvxVR>;
defm V6_vgathermhw_pseudo : vgathermhw<HvxWR>;
+
+multiclass vgather_scatter_mh<RegisterClass RC> {
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
+ def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
+ ".error \"should not emit\" ",
+ []>;
+}
+
+defm V6_vgather_vscatter_mh_pseudo : vgather_scatter_mh<HvxVR>;
+
multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {
let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td
index b8a9cf3..9bcd4bf 100644
--- a/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -75,3 +75,4 @@ include "HexagonScheduleV71T.td"
include "HexagonScheduleV73.td"
include "HexagonScheduleV75.td"
include "HexagonScheduleV79.td"
+include "HexagonScheduleV81.td" \ No newline at end of file
diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV81.td b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td
new file mode 100644
index 0000000..dd5f5a0
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonScheduleV81.td
@@ -0,0 +1,31 @@
+//=-HexagonScheduleV81.td - HexagonV81 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HexagonV81ItinList : DepScalarItinV81, ScalarItin,
+ DepHVXItinV81, HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV81_list, ScalarItin_list,
+ DepHVXItinV81_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV81 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM, CVI_ZW],
+ [Hex_FWD, HVX_FWD],
+ HexagonV81ItinList.ItinList>;
+
+def HexagonModelV81 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV81;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 7430567..995f66d 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -224,6 +224,15 @@ public:
bool useHVXV79Ops() const {
return HexagonHVXVersion >= Hexagon::ArchEnum::V79;
}
+ bool hasV81Ops() const {
+ return getHexagonArchVersion() >= Hexagon::ArchEnum::V81;
+ }
+ bool hasV81OpsOnly() const {
+ return getHexagonArchVersion() == Hexagon::ArchEnum::V81;
+ }
+ bool useHVXV81Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V81;
+ }
bool useAudioOps() const { return UseAudioOps; }
bool useCompound() const { return UseCompound; }
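
The new predicates follow the existing pattern: hasV81Ops() keys off the scalar core version, while useHVXV81Ops() keys off the independently configurable HVX version. A hedged usage sketch (shouldUseVSubMix is an illustrative name, not an LLVM API):

#include "HexagonSubtarget.h"

// Illustrative gate: the vsub_*_mix forms require both V81 HVX ops and
// the QFloat extension, matching the Requires<[UseHVXV81, UseHVXQFloat]>
// predicates on their instruction definitions.
static bool shouldUseVSubMix(const llvm::HexagonSubtarget &ST) {
  return ST.useHVXV81Ops() && ST.useHVXQFloatOps();
}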
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 171e294..e925e04 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -31,6 +31,10 @@ using namespace llvm;
static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx", cl::init(false),
cl::Hidden, cl::desc("Enable loop vectorizer for HVX"));
+cl::opt<bool> HexagonAllowScatterGatherHVX(
+ "hexagon-allow-scatter-gather-hvx", cl::init(false), cl::Hidden,
+ cl::desc("Allow auto-generation of HVX scatter-gather"));
+
static cl::opt<bool> EnableV68FloatAutoHVX(
"force-hvx-float", cl::Hidden,
cl::desc("Enable auto-vectorization of floatint point types on v68."));
@@ -354,6 +358,61 @@ bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/,
return HexagonMaskedVMem && ST.isTypeForHVX(DataType);
}
+bool HexagonTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
+  // For now, assume we cannot deal with all HVX data types.
+ if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+ !HexagonAllowScatterGatherHVX)
+ return false;
+  // This must be kept in sync with the HexagonVectorCombine pass.
+ switch (Ty->getScalarSizeInBits()) {
+ case 8:
+ return (getTypeNumElements(Ty) == 128);
+ case 16:
+ if (getTypeNumElements(Ty) == 64 || getTypeNumElements(Ty) == 32)
+ return (Alignment >= 2);
+ break;
+ case 32:
+ if (getTypeNumElements(Ty) == 32)
+ return (Alignment >= 4);
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool HexagonTTIImpl::isLegalMaskedScatter(Type *Ty, Align Alignment) const {
+ if (!Ty->isVectorTy() || !ST.isTypeForHVX(Ty) ||
+ !HexagonAllowScatterGatherHVX)
+ return false;
+  // This must be kept in sync with the HexagonVectorCombine pass.
+ switch (Ty->getScalarSizeInBits()) {
+ case 8:
+ return (getTypeNumElements(Ty) == 128);
+ case 16:
+ if (getTypeNumElements(Ty) == 64)
+ return (Alignment >= 2);
+ break;
+ case 32:
+ if (getTypeNumElements(Ty) == 32)
+ return (Alignment >= 4);
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedGather(VectorType *VTy,
+ Align Alignment) const {
+ return !isLegalMaskedGather(VTy, Alignment);
+}
+
+bool HexagonTTIImpl::forceScalarizeMaskedScatter(VectorType *VTy,
+ Align Alignment) const {
+ return !isLegalMaskedScatter(VTy, Alignment);
+}
+
/// --- Vector TTI end ---
unsigned HexagonTTIImpl::getPrefetchDistance() const {
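
Both hooks refuse anything that is not an HVX-sized vector type and are gated behind the new -hexagon-allow-scatter-gather-hvx flag; the forceScalarizeMasked{Gather,Scatter} overrides then make the vectorizer scalarize whatever the legality checks reject. A standalone restatement of the gather decision table above, for illustration only (the scatter table is identical except that 16-bit elements accept only the 64-element form):

// Standalone restatement of the legality switch above; illustrative only,
// not the LLVM API.
bool isLegalHvxGather(unsigned ScalarBits, unsigned NumElts,
                      unsigned AlignBytes) {
  switch (ScalarBits) {
  case 8: // 128 x i8 only
    return NumElts == 128;
  case 16: // 64 x i16 or 32 x i16, at least 2-byte aligned
    return (NumElts == 64 || NumElts == 32) && AlignBytes >= 2;
  case 32: // 32 x i32, at least 4-byte aligned
    return NumElts == 32 && AlignBytes >= 4;
  default:
    return false;
  }
}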
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index dbf16c9..cec2bf9 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -169,6 +169,12 @@ public:
unsigned AddressSpace) const override;
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
unsigned AddressSpace) const override;
+ bool isLegalMaskedGather(Type *Ty, Align Alignment) const override;
+ bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override;
+ bool forceScalarizeMaskedGather(VectorType *VTy,
+ Align Alignment) const override;
+ bool forceScalarizeMaskedScatter(VectorType *VTy,
+ Align Alignment) const override;
/// @}
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 9ab5202..5c50ec2 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -57,6 +57,11 @@
#define DEBUG_TYPE "hexagon-vc"
+// This constant represents the default HVX VTCM page size. It is boot-time
+// configurable, so we will probably want an API to read it; for now, assume
+// 128KB.
+#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
+
using namespace llvm;
namespace {
@@ -418,6 +423,18 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
class HvxIdioms {
public:
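+ // Classifies the user of a gather/scatter result: generic LLVM masked
+ // intrinsics (LLVM_*), already-lowered Hexagon intrinsics (HEX_*), or
+ // plain IR users (Arithmetic, LdSt, Call).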
+ enum DstQualifier {
+ Undefined = 0,
+ Arithmetic,
+ LdSt,
+ LLVM_Gather,
+ LLVM_Scatter,
+ HEX_Gather_Scatter,
+ HEX_Gather,
+ HEX_Scatter,
+ Call
+ };
+
HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
auto *Int32Ty = HVC.getIntTy(32);
HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
@@ -473,6 +490,11 @@ private:
auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
Signedness SgnX, ArrayRef<Value *> WordY,
Signedness SgnY) const -> SmallVector<Value *>;
+ // Vector manipulations for Ripple
+ bool matchScatter(Instruction &In) const;
+ bool matchGather(Instruction &In) const;
+ Value *processVScatter(Instruction &In) const;
+ Value *processVGather(Instruction &In) const;
VectorType *HvxI32Ty;
VectorType *HvxP32Ty;
@@ -1545,7 +1567,7 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
}
auto AlignVectors::run() -> bool {
- LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors on " << HVC.F.getName()
+ LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
<< '\n');
if (!createAddressGroups())
return false;
@@ -1797,6 +1819,846 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
return Ext;
}
+inline bool HvxIdioms::matchScatter(Instruction &In) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+ if (!II)
+ return false;
+ return (II->getIntrinsicID() == Intrinsic::masked_scatter);
+}
+
+inline bool HvxIdioms::matchGather(Instruction &In) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
+ if (!II)
+ return false;
+ return (II->getIntrinsicID() == Intrinsic::masked_gather);
+}
+
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
+
+// Binary instructions we want to handle as users of gather/scatter.
+inline bool isArithmetic(unsigned Opc) {
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::UDiv:
+ return true;
+ }
+ return false;
+}
+
+// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
+inline Value *getPointer(Value *Ptr) {
+ assert(Ptr && "Unable to extract pointer");
+ if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
+ return Ptr;
+ if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
+ return getLoadStorePointerOperand(Ptr);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Ptr)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(1);
+ }
+ return nullptr;
+}
+
+static Instruction *selectDestination(Instruction *In,
+ HvxIdioms::DstQualifier &Qual) {
+ Instruction *Destination = nullptr;
+ if (!In)
+ return Destination;
+ if (isa<StoreInst>(In)) {
+ Destination = In;
+ Qual = HvxIdioms::LdSt;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_gather) {
+ Destination = In;
+ Qual = HvxIdioms::LLVM_Gather;
+ } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
+ Destination = In;
+ Qual = HvxIdioms::LLVM_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
+ Destination = In;
+ Qual = HvxIdioms::LdSt;
+ } else if (II->getIntrinsicID() ==
+ Intrinsic::hexagon_V6_vgather_vscattermh) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Gather_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Scatter;
+ } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
+ Destination = In;
+ Qual = HvxIdioms::HEX_Gather;
+ }
+ } else if (isa<CastInst>(In)) {
+ // This also covers zext and the other cast instructions.
+ return locateDestination(In, Qual);
+ } else if (isa<CallInst>(In)) {
+ Destination = In;
+ Qual = HvxIdioms::Call;
+ } else if (isa<GetElementPtrInst>(In)) {
+ return locateDestination(In, Qual);
+ } else if (isArithmetic(In->getOpcode())) {
+ Destination = In;
+ Qual = HvxIdioms::Arithmetic;
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
+ }
+ return Destination;
+}
+
+// This method attempts to find the destination (user) for a given intrinsic.
+// Given that these are produced only by Ripple, the number of options is
+// limited. The simplest case is an explicit store, which is in fact redundant
+// (since the HVX gather creates its own store during packetization).
+// Nevertheless we need to figure out the address we are storing to. The other
+// cases are more complicated, but still few.
+Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
+ Instruction *Destination = nullptr;
+ if (!In)
+ return Destination;
+ // Get all possible destinations
+ SmallVector<Instruction *> Users;
+ // Iterate over the uses of the instruction
+ for (auto &U : In->uses()) {
+ if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
+ Destination = selectDestination(UI, Qual);
+ if (Destination)
+ Users.push_back(Destination);
+ }
+ }
+ // Now see which of the users (if any) is a memory destination.
+ for (auto *I : Users)
+ if (getPointer(I))
+ return I;
+ return Destination;
+}
+
+// The two intrinsics we handle here have the GEP in different operand
+// positions.
+inline GetElementPtrInst *locateGepFromIntrinsic(Instruction *In) {
+ assert(In && "Bad instruction");
+ IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
+ assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
+ IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
+ "Not a gather/scatter intrinsic");
+ GetElementPtrInst *GEPIndex = nullptr;
+ if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
+ GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
+ else
+ GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
+ return GEPIndex;
+}
+
+// Given the intrinsic, find its GEP argument and extract the base address it
+// uses. The method relies on the way Ripple typically forms the GEP for
+// scatter/gather.
+static Value *locateAddressFromIntrinsic(Instruction *In) {
+ GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+ if (!GEPIndex) {
+ LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
+ return nullptr;
+ }
+ Value *BaseAddress = GEPIndex->getPointerOperand();
+ auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
+ if (IndexLoad)
+ return IndexLoad;
+
+ auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
+ if (IndexZEx) {
+ IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
+ if (IndexLoad)
+ return IndexLoad;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+ return locateAddressFromIntrinsic(II);
+ }
+ auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
+ if (BaseShuffle) {
+ IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
+ if (IndexLoad)
+ return IndexLoad;
+ auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
+ if (IE) {
+ auto *Src = IE->getOperand(1);
+ IndexLoad = dyn_cast<LoadInst>(Src);
+ if (IndexLoad)
+ return IndexLoad;
+ auto *Alloca = dyn_cast<AllocaInst>(Src);
+ if (Alloca)
+ return Alloca;
+ if (isa<Argument>(Src)) {
+ return Src;
+ }
+ if (isa<GlobalValue>(Src)) {
+ return Src;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
+ return nullptr;
+}
+
+static Type *getIndexType(Value *In) {
+ if (!In)
+ return nullptr;
+
+ if (isa<LoadInst>(In) || isa<StoreInst>(In))
+ return getLoadStoreType(In);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return II->getType();
+ if (II->getIntrinsicID() == Intrinsic::masked_store)
+ return II->getOperand(0)->getType();
+ }
+ return In->getType();
+}
+
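+// Walk through zext/sext/shuffle/insertelement wrappers to find the value
+// that actually supplies the indexes: a load, a masked load/gather, a
+// constant vector, or the base operand of a nested GEP.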
+static Value *locateIndexesFromGEP(Value *In) {
+ if (!In)
+ return nullptr;
+ if (isa<LoadInst>(In))
+ return In;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
+ if (II->getIntrinsicID() == Intrinsic::masked_load)
+ return In;
+ if (II->getIntrinsicID() == Intrinsic::masked_gather)
+ return In;
+ }
+ if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
+ return locateIndexesFromGEP(IndexZEx->getOperand(0));
+ if (auto *IndexSEx = dyn_cast<SExtInst>(In))
+ return locateIndexesFromGEP(IndexSEx->getOperand(0));
+ if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
+ return locateIndexesFromGEP(BaseShuffle->getOperand(0));
+ if (auto *IE = dyn_cast<InsertElementInst>(In))
+ return locateIndexesFromGEP(IE->getOperand(1));
+ if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
+ return cstDataVector;
+ if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
+ return GEPIndex->getOperand(0);
+ return nullptr;
+}
+
+// Given the intrinsic, find its GEP argument and extract the offsets from the
+// base address it uses.
+static Value *locateIndexesFromIntrinsic(Instruction *In) {
+ GetElementPtrInst *GEPIndex = locateGepFromIntrinsic(In);
+ if (!GEPIndex) {
+ LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
+ return nullptr;
+ }
+ Value *Indexes = GEPIndex->getOperand(1);
+ if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
+ return IndexLoad;
+
+ LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
+ return nullptr;
+}
+
+// Because of the awkward definition of many Hexagon intrinsics, we often have
+// to reinterpret the HVX-native <64 x i16> as <32 x i32>, which in practice
+// is a NOP for all use cases; this only exists to make the IR builder happy.
+inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder,
+ LLVMContext &Ctx, Value *I) {
+ assert(I && "Unable to reinterprete cast");
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
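+ // The shuffle below is an identity permutation [0..63]; the actual
+ // reinterpretation is done by the bitcast to <32 x i32>.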
+ std::vector<unsigned> shuffleMask;
+ for (unsigned i = 0; i < 64; ++i)
+ shuffleMask.push_back(i);
+ Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+ Value *CastShuffle =
+ Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+ return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
+}
+
+// Recast <128 x i8> as <32 x i32>
+inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder,
+ LLVMContext &Ctx, Value *I) {
+ assert(I && "Unable to reinterprete cast");
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ std::vector<unsigned> shuffleMask;
+ for (unsigned i = 0; i < 128; ++i)
+ shuffleMask.push_back(i);
+ Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
+ Value *CastShuffle =
+ Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
+ return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
+}
+
+// Create a <32 x i32> mask reinterpreted as <128 x i1> from the given pattern.
+inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
+ IRBuilderBase &Builder, LLVMContext &Ctx,
+ unsigned int pattern) {
+ std::vector<unsigned int> byteMask;
+ for (unsigned i = 0; i < 32; ++i)
+ byteMask.push_back(pattern);
+
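+ // V6_vandvrt converts the replicated 32-bit pattern into a 128-bit
+ // predicate, setting the bit for each byte lane whose pattern byte is
+ // non-zero.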
+ return Builder.CreateIntrinsic(
+ HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
+ {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
+ nullptr);
+}
+
+Value *HvxIdioms::processVScatter(Instruction &In) const {
+ auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
+ assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");
+ unsigned InpSize = HVC.getSizeOf(InpTy);
+ auto *F = In.getFunction();
+ LLVMContext &Ctx = F->getContext();
+ auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
+ assert(ElemTy && "llvm.scatter needs integer type argument");
+ unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
+ LLVM_DEBUG({
+ unsigned Elements = HVC.length(InpTy);
+ dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
+ dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
+ << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
+ << ElemWidth << ")\n";
+ });
+
+ IRBuilder Builder(In.getParent(), In.getIterator(),
+ InstSimplifyFolder(HVC.DL));
+
+ auto *ValueToScatter = In.getOperand(0);
+ LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
+
+ if (HVC.HST.getVectorLength() != InpSize) {
+ LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
+ << ") for vscatter\n");
+ return nullptr;
+ }
+
+ // Base address of indexes.
+ auto *IndexLoad = locateAddressFromIntrinsic(&In);
+ if (!IndexLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
+
+ // Address of destination. Must be in VTCM.
+ auto *Ptr = getPointer(IndexLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+ // Indexes/offsets
+ auto *Indexes = locateIndexesFromIntrinsic(&In);
+ if (!Indexes)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
+ Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
+ "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
+ // Adjust Indexes
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ Value *CastIndex = nullptr;
+ if (cstDataVector) {
+ // Our indexes are represented as a constant. We need them in a register.
+ AllocaInst *IndexesAlloca =
+ Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
+ IndexesAlloca, "reload_index");
+ } else {
+ if (ElemWidth == 2)
+ CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ else
+ CastIndex = Indexes;
+ }
+ LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
+
+ if (ElemWidth == 1) {
+ // v128i8: there is no native instruction for this.
+ // Do this as two Hi/Lo scatters with masking.
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ // Extend indexes. We assume that indexes are in 128i8 format and need to
+ // expand them to Hi/Lo 64i16.
+ Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
+ auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+ auto *UnpackedIndexes = Builder.CreateIntrinsic(
+ HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
+
+ auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+ auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+ [[maybe_unused]] Value *IndexHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+ [[maybe_unused]] Value *IndexLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+ LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
+ // Now unpack values to scatter
+ Value *CastSrc =
+ getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
+ LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
+ auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
+ HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
+ << ")\n");
+
+ [[maybe_unused]] Value *UVSHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
+ [[maybe_unused]] Value *UVSLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
+ LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
+
+ // Create the mask for individual bytes
+ auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+ LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
+ [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+ {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ IndexHi, UVSHi},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
+ {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ IndexLo, UVSLo},
+ nullptr);
+ } else if (ElemWidth == 2) {
+ Value *CastSrc =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
+ LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
+ {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+ CastSrc},
+ nullptr);
+ } else if (ElemWidth == 4) {
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
+ {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
+ ValueToScatter},
+ nullptr);
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
+ return nullptr;
+ }
+}
+
+Value *HvxIdioms::processVGather(Instruction &In) const {
+ [[maybe_unused]] auto *InpTy =
+ dyn_cast<VectorType>(In.getOperand(0)->getType());
+ assert(InpTy && "Cannot handle no vector type for llvm.gather");
+ [[maybe_unused]] auto *ElemTy =
+ dyn_cast<PointerType>(InpTy->getElementType());
+ assert(ElemTy && "llvm.gather needs vector of ptr argument");
+ auto *F = In.getFunction();
+ LLVMContext &Ctx = F->getContext();
+ LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
+ << *In.getParent() << "\n");
+ LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
+ << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
+ << ") type(" << *ElemTy << ") Access alignment("
+ << *In.getOperand(1) << ") AddressSpace("
+ << ElemTy->getAddressSpace() << ")\n");
+
+ // TODO: Handle masking of elements.
+ assert(isa<VectorType>(In.getOperand(2)->getType()) &&
+ "llvm.gather needs a vector mask");
+ IRBuilder Builder(In.getParent(), In.getIterator(),
+ InstSimplifyFolder(HVC.DL));
+
+ // See who is using the result. The difference between the LLVM and HVX
+ // vgather intrinsics makes it impossible to handle all cases without temp
+ // storage. Alloca in VTCM is not yet supported, so for now we just bail out
+ // for those cases.
+ HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
+ Instruction *Dst = locateDestination(&In, Qual);
+ if (!Dst) {
+ LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
+ << ")\n");
+
+ // Address of destination. Must be in VTCM.
+ auto *Ptr = getPointer(Dst);
+ if (!Ptr) {
+ LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
+ return nullptr;
+ }
+
+ // Result type. Assume it is a vector type.
+ auto *DstType = dyn_cast<VectorType>(getIndexType(Dst));
+ assert(DstType && "Cannot handle non-vector dst type for llvm.gather");
+
+ // Base address for sources to be loaded
+ auto *IndexLoad = locateAddressFromIntrinsic(&In);
+ if (!IndexLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
+
+ // Gather indexes/offsets
+ auto *Indexes = locateIndexesFromIntrinsic(&In);
+ if (!Indexes)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
+
+ Instruction *Gather = nullptr;
+ Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
+ if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
+ // We assume the address is in the VTCM address space. We also assume that
+ // all pointers in Operand(0) have the same base(!).
+ // This is the most basic of all the cases above.
+ unsigned OutputSize = HVC.getSizeOf(DstType);
+ auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
+ unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
+ LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
+ << " Address space ("
+ << Ptr->getType()->getPointerAddressSpace() << ")\n"
+ << " Result type : " << *DstType
+ << "\n Size in bytes : " << OutputSize
+ << " element type(" << *DstElemTy
+ << ")\n ElemWidth : " << ElemWidth << " bytes\n");
+
+ auto *IndexType = dyn_cast<VectorType>(getIndexType(Indexes));
+ assert(IndexType && "Cannot handle non-vector index type for llvm.gather");
+ unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
+ LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
+
+ // Intrinsic takes i32 instead of pointer so cast.
+ Value *CastedPtr = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
+ // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
+ // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
+ // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
+ // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
+ // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
+ // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
+ if (HVC.HST.getVectorLength() == OutputSize) {
+ if (ElemWidth == 1) {
+ // v128i8: there is no native instruction for this.
+ // Do this as two Hi/Lo gathers with masking.
+ // Unpack indexes. We assume that indexes are in 128i8 format and need to
+ // expand them to Hi/Lo 64i16.
+ Value *CastIndexes =
+ Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
+ auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
+ auto *UnpackedIndexes =
+ Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
+ V6_vunpack, CastIndexes, nullptr);
+ LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
+ << ")\n");
+
+ auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
+ auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
+ [[maybe_unused]] Value *IndexHi =
+ HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
+ [[maybe_unused]] Value *IndexLo =
+ HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
+ LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
+ LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
+ // Create the mask for individual bytes
+ auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
+ LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
+ // We use our destination allocation as temp storage.
+ // This is unlikely to work properly for masked gather.
+ auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
+ [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, QByteMask, CastedPtr,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
+ // Rematerialize the result
+ [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
+ LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
+ // Same for the low part. Here we use Gather to return a non-null result
+ // from this function and continue to iterate. We also delete the Dst
+ // store below.
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, QByteMask, CastedPtr,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
+ nullptr);
+ LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
+ Value *LoadedResultLo = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
+ LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
+ // Now we have properly sized bytes in every other position
+ // B b A a c a A b B c f F g G h H is presented as
+ // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
+ // Use vpack to gather them
+ auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
+ [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
+ NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
+ LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
+ [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
+ LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
+ } else if (ElemWidth == 2) {
+ // v32i16
+ if (IndexWidth == 2) {
+ // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match.
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
+ // Shift all i16 indexes left by 1 to convert halfword element indexes
+ // into byte offsets.
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+ LLVM_DEBUG(dbgs()
+ << " Shifted half index: " << *AdjustedIndex << ")\n");
+
+ auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
+ // The 3rd argument is the size of the region to gather from. We probably
+ // want to set it to the max VTCM size.
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), V6_vgather,
+ {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+ LLVM_DEBUG({
+ for (auto &U : Dst->uses())
+ if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+ dbgs() << " dst used by: " << *UI << "\n";
+ for (auto &U : In.uses())
+ if (auto *UI = dyn_cast<Instruction>(U.getUser()))
+ dbgs() << " In used by : " << *UI << "\n";
+ });
+ // Create temp load from result in case the result is used by any
+ // other instruction.
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ } else {
+ dbgs() << " Unhandled index type for vgather\n";
+ return nullptr;
+ }
+ } else if (ElemWidth == 4) {
+ if (IndexWidth == 4) {
+ // v32i32
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
+ LLVM_DEBUG(dbgs()
+ << " Shifted word index: " << *AdjustedIndex << ")\n");
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
+ {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+ } else {
+ LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
+ return nullptr;
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
+ return nullptr;
+ }
+ } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
+ // This is half of the reg width, duplicate low in high
+ LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
+ return nullptr;
+ } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
+ LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n");
+ return nullptr;
+ }
+ // Erase the original intrinsic and the store that consumes it.
+ // HVX will create a pseudo for gather that is expanded to gather + store
+ // during packetization.
+ Dst->eraseFromParent();
+ } else if (Qual == HvxIdioms::LLVM_Scatter) {
+ // Gather feeds directly into scatter.
+ LLVM_DEBUG({
+ auto *DstInpTy = dyn_cast<VectorType>(Dst->getOperand(1)->getType());
+ assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
+ unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
+ unsigned DstElements = HVC.length(DstInpTy);
+ auto *DstElemTy = dyn_cast<PointerType>(DstInpTy->getElementType());
+ assert(DstElemTy && "llvm.scatter needs a vector-of-pointers argument");
+ dbgs() << " Gather feeds into scatter\n Values to scatter : "
+ << *Dst->getOperand(0) << "\n";
+ dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
+ << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
+ << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
+ });
+ // Address of source
+ auto *Src = getPointer(IndexLoad);
+ if (!Src)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
+
+ if (!isa<PointerType>(Src->getType())) {
+ LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
+ return nullptr;
+ }
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
+
+ auto *DstLoad = locateAddressFromIntrinsic(Dst);
+ if (!DstLoad) {
+ LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
+
+ Value *Ptr = getPointer(DstLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
+ LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
+ // Shift all i16 indexes left by 1 to convert halfword element indexes
+ // into byte offsets.
+ auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
+ Value *AdjustedIndex = HVC.createHvxIntrinsic(
+ Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
+ LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
+
+ return Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ AdjustedIndex},
+ nullptr);
+ } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
+ // Gather feeds into a previously inserted pseudo intrinsic.
+ // These cannot be in the same packet, so we need to generate another
+ // pseudo that is expanded to .tmp + store: V6_vgathermh_pseudo
+ // (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt, ModRegs:$Mu, HvxVR:$Vv)
+ if (isa<AllocaInst>(IndexLoad)) {
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ if (cstDataVector) {
+ // Our indexes are represented as a constant. We need them in a register.
+ // This most likely will not work properly, since alloca gives us a DDR
+ // stack location. This will be fixed once we teach the compiler about VTCM.
+ AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ Value *LoadedIndex = Builder.CreateLoad(
+ IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ }
+ } else {
+ // Address of source
+ auto *Src = getPointer(IndexLoad);
+ if (!Src)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
+
+ auto *DstLoad = locateAddressFromIntrinsic(Dst);
+ if (!DstLoad)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
+ auto *Ptr = getPointer(DstLoad);
+ if (!Ptr)
+ return nullptr;
+ LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
+ {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ Indexes},
+ nullptr);
+ }
+ return Gather;
+ } else if (Qual == HvxIdioms::HEX_Scatter) {
+ // This is the case when the result of a gather is used as an argument to
+ // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
+ // ourselves. We have to create an alloca, store to it, and replace all
+ // uses with that.
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+ Value *CastIndex =
+ getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
+ LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
+ CastIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ } else if (Qual == HvxIdioms::HEX_Gather) {
+ // Gather feeds into another gather that has already been replaced with
+ // hexagon_V6_vgathermh_128B.
+ if (isa<AllocaInst>(IndexLoad)) {
+ auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
+ if (cstDataVector) {
+ // Our indexes are represented as a constant. We need them in a register.
+ AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
+
+ [[maybe_unused]] auto *StoreIndexes =
+ Builder.CreateStore(cstDataVector, IndexesAlloca);
+ LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
+ Value *LoadedIndex = Builder.CreateLoad(
+ IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
+ AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
+ LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
+ << "\n AddressSpace: "
+ << ResultAlloca->getAddressSpace() << "\n";);
+
+ Value *CastedSrc = Builder.CreateBitOrPointerCast(
+ IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
+ LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
+
+ Gather = Builder.CreateIntrinsic(
+ Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
+ {ResultAlloca, CastedSrc,
+ HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
+ nullptr);
+ Value *LoadedResult = Builder.CreateLoad(
+ HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
+ LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
+ LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
+ In.replaceAllUsesWith(LoadedResult);
+ }
+ }
+ } else if (Qual == HvxIdioms::LLVM_Gather) {
+ // Gather feeds into another gather.
+ errs() << " Unimplemented vgather-to-vgather sequence\n";
+ return nullptr;
+ } else
+ llvm_unreachable("Unhandled Qual enum");
+
+ return Gather;
+}
+
auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
const FxpOp &Op) const -> Value * {
assert(Op.X.Val->getType() == Op.Y.Val->getType());
@@ -2138,6 +3000,26 @@ auto HvxIdioms::run() -> bool {
It = StartOver ? B.rbegin()
: cast<Instruction>(New)->getReverseIterator();
Changed = true;
+ } else if (matchGather(*It)) {
+ Value *New = processVGather(*It);
+ if (!New)
+ continue;
+ LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
+ // We replace the original intrinsic with the new pseudo call.
+ It->eraseFromParent();
+ It = cast<Instruction>(New)->getReverseIterator();
+ RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+ Changed = true;
+ } else if (matchScatter(*It)) {
+ Value *New = processVScatter(*It);
+ if (!New)
+ continue;
+ LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
+ // We replace the original intrinsic with the new pseudo call.
+ It->eraseFromParent();
+ It = cast<Instruction>(New)->getReverseIterator();
+ RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
+ Changed = true;
}
}
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 6455757..2f59b7c 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -186,6 +186,9 @@ static unsigned featureToArchVersion(unsigned Feature) {
case Hexagon::ArchV79:
case Hexagon::ExtensionHVXV79:
return 79;
+ case Hexagon::ArchV81:
+ case Hexagon::ExtensionHVXV81:
+ return 81;
}
llvm_unreachable("Expected valid arch feature");
return 0;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 6b48a21..b8075bd 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -96,6 +96,8 @@ cl::opt<bool> MV75("mv75", cl::Hidden, cl::desc("Build for Hexagon V75"),
cl::init(false));
cl::opt<bool> MV79("mv79", cl::Hidden, cl::desc("Build for Hexagon V79"),
cl::init(false));
+cl::opt<bool> MV81("mv81", cl::Hidden, cl::desc("Build for Hexagon V81"),
+ cl::init(false));
} // namespace
static cl::opt<Hexagon::ArchEnum> EnableHVX(
@@ -111,6 +113,7 @@ static cl::opt<Hexagon::ArchEnum> EnableHVX(
clEnumValN(Hexagon::ArchEnum::V73, "v73", "Build for HVX v73"),
clEnumValN(Hexagon::ArchEnum::V75, "v75", "Build for HVX v75"),
clEnumValN(Hexagon::ArchEnum::V79, "v79", "Build for HVX v79"),
+ clEnumValN(Hexagon::ArchEnum::V81, "v81", "Build for HVX v81"),
// Sentinel for no value specified.
clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
// Sentinel for flag not present.
@@ -159,6 +162,8 @@ static StringRef HexagonGetArchVariant() {
return "hexagonv75";
if (MV79)
return "hexagonv79";
+ if (MV81)
+ return "hexagonv81";
return "";
}
@@ -474,6 +479,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
case Hexagon::ArchEnum::V79:
Result.push_back("+hvxv79");
break;
+ case Hexagon::ArchEnum::V81:
+ Result.push_back("+hvxv81");
+ break;
case Hexagon::ArchEnum::Generic: {
Result.push_back(StringSwitch<StringRef>(CPU)
@@ -489,7 +497,8 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
.Case("hexagonv71t", "+hvxv71")
.Case("hexagonv73", "+hvxv73")
.Case("hexagonv75", "+hvxv75")
- .Case("hexagonv79", "+hvxv79"));
+ .Case("hexagonv79", "+hvxv79")
+ .Case("hexagonv81", "+hvxv81"));
break;
}
case Hexagon::ArchEnum::NoArch:
@@ -538,8 +547,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
FeatureBitset FB = S;
unsigned CpuArch = ArchV5;
for (unsigned F :
- {ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67, ArchV66,
- ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
+ {ArchV81, ArchV79, ArchV75, ArchV73, ArchV71, ArchV69, ArchV68, ArchV67,
+ ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
if (!FB.test(F))
continue;
CpuArch = F;
@@ -556,7 +565,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
for (unsigned F :
{ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66,
ExtensionHVXV67, ExtensionHVXV68, ExtensionHVXV69, ExtensionHVXV71,
- ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79}) {
+ ExtensionHVXV73, ExtensionHVXV75, ExtensionHVXV79, ExtensionHVXV81}) {
if (!FB.test(F))
continue;
HasHvxVer = true;
@@ -569,6 +578,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
// HasHvxVer is false, and UseHvx is true.
switch (CpuArch) {
+ case ArchV81:
+ FB.set(ExtensionHVXV81);
+ [[fallthrough]];
case ArchV79:
FB.set(ExtensionHVXV79);
[[fallthrough]];
@@ -668,12 +680,12 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {
std::optional<unsigned>
Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
- for (auto Arch : {Hexagon::ExtensionHVXV79, Hexagon::ExtensionHVXV75,
- Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71,
- Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68,
- Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66,
- Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62,
- Hexagon::ExtensionHVXV60})
+ for (auto Arch : {Hexagon::ExtensionHVXV81, Hexagon::ExtensionHVXV79,
+ Hexagon::ExtensionHVXV75, Hexagon::ExtensionHVXV73,
+ Hexagon::ExtensionHVXV71, Hexagon::ExtensionHVXV69,
+ Hexagon::ExtensionHVXV68, Hexagon::ExtensionHVXV67,
+ Hexagon::ExtensionHVXV66, Hexagon::ExtensionHVXV65,
+ Hexagon::ExtensionHVXV62, Hexagon::ExtensionHVXV60})
if (Features.test(Arch))
return Arch;
return {};
@@ -681,13 +693,13 @@ Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {
for (auto Arch :
- {Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73, Hexagon::ArchV71,
- Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67, Hexagon::ArchV66,
- Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60, Hexagon::ArchV55,
- Hexagon::ArchV5})
+ {Hexagon::ArchV81, Hexagon::ArchV79, Hexagon::ArchV75, Hexagon::ArchV73,
+ Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68, Hexagon::ArchV67,
+ Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62, Hexagon::ArchV60,
+ Hexagon::ArchV55, Hexagon::ArchV5})
if (Features.test(Arch))
return Arch;
- llvm_unreachable("Expected arch v5-v79");
+ llvm_unreachable("Expected arch v5-v81");
return 0;
}
@@ -708,7 +720,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
.Case("hexagonv71t", llvm::ELF::EF_HEXAGON_MACH_V71T)
.Case("hexagonv73", llvm::ELF::EF_HEXAGON_MACH_V73)
.Case("hexagonv75", llvm::ELF::EF_HEXAGON_MACH_V75)
- .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79);
+ .Case("hexagonv79", llvm::ELF::EF_HEXAGON_MACH_V79)
+ .Case("hexagonv81", llvm::ELF::EF_HEXAGON_MACH_V81);
}
llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index aca7abd..44d1a44 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -4578,6 +4578,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
+def : InstAlias<"mtpidr $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsISA3_0]>;
+def : InstAlias<"mfpidr $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsISA3_0]>;
foreach SPRG = 4-7 in {
def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 9e6b7f0..2754d78 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1124,7 +1124,8 @@ def HasStdExtZbkbOrP
"'Base P' (Packed-SIMD)">;
def HasStdExtZbbOrZbkbOrP
- : Predicate<"Subtarget->HasStdExtZbbOrZbkb()|| Subtarget->hasStdExtP()">,
+ : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || "
+ "Subtarget->hasStdExtP()">,
AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb, FeatureStdExtP),
"'Zbb' (Basic Bit-Manipulation) or "
"'Zbkb' (Bitmanip instructions for Cryptography) or "
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 219e3f2..1c930ac 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -318,8 +318,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
- if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() &&
- !Subtarget.hasVendorXqcibm() && !Subtarget.hasVendorXAndesPerf() &&
+ if (!Subtarget.hasStdExtZbb() && !Subtarget.hasStdExtP() &&
+ !Subtarget.hasVendorXTHeadBb() && !Subtarget.hasVendorXqcibm() &&
+ !Subtarget.hasVendorXAndesPerf() &&
!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
@@ -392,7 +393,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
}
- if (Subtarget.hasStdExtZbb() ||
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP() ||
(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,
Legal);
@@ -403,6 +404,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
} else {
setOperationAction(ISD::CTTZ, XLenVT, Expand);
+ // If we have CLZW but not CTZW, custom promote i32.
+ if (Subtarget.hasStdExtP() && Subtarget.is64Bit())
+ setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
if (!Subtarget.hasCPOPLike()) {
@@ -419,13 +423,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// We need the custom lowering to make sure that the resulting sequence
// for the 32bit case is efficient on 64bit targets.
// Use default promotion for i32 without Zbb.
- if (Subtarget.is64Bit() && Subtarget.hasStdExtZbb())
+ if (Subtarget.is64Bit() &&
+ (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtP()))
setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
} else {
setOperationAction(ISD::CTLZ, XLenVT, Expand);
}
- if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) {
+ if (Subtarget.hasStdExtP() ||
+ (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
setOperationAction(ISD::ABS, XLenVT, Legal);
} else if (Subtarget.hasShortForwardBranchOpt()) {
// We can use PseudoCCSUB to implement ABS.
@@ -14669,6 +14675,25 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
bool IsCTZ =
N->getOpcode() == ISD::CTTZ || N->getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+
+ // Without Zbb, lower as 32 - clzw(~X & (X-1))
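+ // ~X & (X-1) yields a mask of exactly the trailing zero bits of X
+ // (2^t - 1 for t trailing zeros), so clzw returns 32 - t and subtracting
+ // from 32 recovers t = cttz(X).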
+ if (IsCTZ && !Subtarget.hasStdExtZbb()) {
+ assert(Subtarget.hasStdExtP());
+
+ NewOp0 = DAG.getFreeze(NewOp0);
+ SDValue Not = DAG.getNOT(DL, NewOp0, MVT::i64);
+ SDValue Minus1 = DAG.getNode(ISD::SUB, DL, MVT::i64, NewOp0,
+ DAG.getConstant(1, DL, MVT::i64));
+ SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Not, Minus1);
+ SDValue CLZW = DAG.getNode(RISCVISD::CLZW, DL, MVT::i64, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, MVT::i64,
+ DAG.getConstant(32, DL, MVT::i64), CLZW);
+ SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, Sub,
+ DAG.getValueType(MVT::i32));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ return;
+ }
+
unsigned Opc = IsCTZ ? RISCVISD::CTZW : RISCVISD::CLZW;
SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 7d8a919..cc085bb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1455,3 +1455,11 @@ let Predicates = [HasStdExtP, IsRV32] in {
def PMAXU_DW : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">;
def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">;
} // Predicates = [HasStdExtP, IsRV32]
+
+
+//===----------------------------------------------------------------------===//
+// Codegen patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtP] in
+def : PatGpr<abs, ABS>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 4c2f7f6..f7b4914 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -218,11 +218,13 @@ let Predicates = [HasVendorXSfvcp], mayLoad = 0, mayStore = 0,
}
let Predicates = [HasVendorXSfvfexpAny], DecoderNamespace = "XSfvector" in {
- def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">;
+ def SF_VFEXP_V : VALUVs2<0b010011, 0b00111, OPFVV, "sf.vfexp.v">,
+ SchedUnaryMC<"WriteSF_VFExp", "ReadSF_VFExp">;
}
let Predicates = [HasVendorXSfvfexpa], DecoderNamespace = "XSfvector" in {
- def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">;
+ def SF_VFEXPA_V : VALUVs2<0b010011, 0b00110, OPFVV, "sf.vfexpa.v">,
+ SchedUnaryMC<"WriteSF_VFExpa", "ReadSF_VFExpa">;
}
let Predicates = [HasVendorXSfvqmaccdod], DecoderNamespace = "XSfvector",
@@ -487,6 +489,48 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], AltFmtType = IS_NOT_ALTFMT in {
defm SF_VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;
}
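+// Compute the SEW list for each variant: the BF16 form supports only SEW=16,
+// the approximate (vfexpa) form supports every FP SEW, and the base form
+// supports everything except SEW=64.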
+class VFExpSchedSEWSet<string mx, bit IsBF16, bit IsApprox> {
+ defvar BaseSet = SchedSEWSet<mx, isF=1>.val;
+ list<int> val = !if(IsBF16, !listremove(BaseSet, [32, 64]),
+ !if(IsApprox, BaseSet, !listremove(BaseSet, [64])));
+}
+multiclass VPseudoVFExp_V<bit IsBF16 = false, bit IsApprox = false> {
+ defvar SchedSuffix = !if(IsApprox, "VFExpa", "VFExp");
+
+ foreach m = MxListF in {
+ defvar mx = m.MX;
+ foreach e = VFExpSchedSEWSet<mx, IsBF16, IsApprox>.val in {
+ let VLMul = m.value in {
+ def "_V_" # mx # "_E" # e
+ : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
+ SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix,
+ mx, e, forcePassthruRead=true>;
+ def "_V_" # mx # "_E" # e # "_MASK"
+ : VPseudoUnaryMask<m.vrclass, m.vrclass>,
+ RISCVMaskedPseudo<MaskIdx = 2>,
+ SchedUnary<"WriteSF_" # SchedSuffix, "ReadSF_" # SchedSuffix,
+ mx, e, forcePassthruRead=true>;
+ }
+ }
+ }
+}
+
+let Predicates = [HasVendorXSfvfbfexp16e], hasSideEffects = 0 in {
+ let AltFmtType = IS_ALTFMT in {
+ defm PseudoSF_VFEXP_ALT : VPseudoVFExp_V<IsBF16=true>;
+ }
+}
+
+let Predicates = [HasVendorXSfvfexpAnyFloat], hasSideEffects = 0 in {
+ let AltFmtType = IS_NOT_ALTFMT in {
+ defm PseudoSF_VFEXP : VPseudoVFExp_V;
+ }
+}
+
+let Predicates = [HasVendorXSfvfexpa], AltFmtType = IS_NOT_ALTFMT in {
+ defm PseudoSF_VFEXPA : VPseudoVFExp_V<IsApprox=true>;
+}
+
// SDNode
def SDT_SF_VC_V_X : SDTypeProfile<1, 4, [SDTCisVec<0>,
SDTCisVT<1, XLenVT>,
@@ -893,3 +937,36 @@ let Predicates = [HasVendorXSfcease] in {
let rs2 = 0b00101;
}
}
+
+let Predicates = [HasVendorXSfvfbfexp16e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP_ALT",
+ AllBF16Vectors,
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexp16e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP",
+ [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8],
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexp32e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexp", "PseudoSF_VFEXP",
+ [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF32MF2, VF32M1, VF32M2, VF32M4, VF32M8], isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa, HasVInstructionsF16] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF16MF4, VF16MF2, VF16M1, VF16M2, VF16M4, VF16M8],
+ isSEWAware=1>;
+}
+
+let Predicates = [HasVendorXSfvfexpa64e] in {
+ defm : VPatUnaryV_V<"int_riscv_sf_vfexpa", "PseudoSF_VFEXPA",
+ [VF64M1, VF64M2, VF64M4, VF64M8], isSEWAware=1>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 6b9a75f..5429c2a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -599,14 +599,20 @@ def : PatGpr<riscv_zip, ZIP_RV32, i32>;
def : PatGpr<riscv_unzip, UNZIP_RV32, i32>;
} // Predicates = [HasStdExtZbkb, IsRV32]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : PatGpr<ctlz, CLZ>;
+}
+
+let Predicates = [HasStdExtZbb] in {
def : PatGpr<cttz, CTZ>;
def : PatGpr<ctpop, CPOP>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrP, IsRV64] in {
def : PatGpr<riscv_clzw, CLZW>;
+}
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
def : PatGpr<riscv_ctzw, CTZW>;
def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
@@ -614,22 +620,22 @@ def : Pat<(i64 (riscv_negw_max GPR:$rs1)),
(MAX GPR:$rs1, (XLenVT (SUBW (XLenVT X0), GPR:$rs1)))>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : Pat<(XLenVT (sext_inreg GPR:$rs1, i8)), (SEXT_B GPR:$rs1)>;
def : Pat<(XLenVT (sext_inreg GPR:$rs1, i16)), (SEXT_H GPR:$rs1)>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb] in {
+let Predicates = [HasStdExtZbbOrP] in {
def : PatGprGpr<smin, MIN>;
def : PatGprGpr<smax, MAX>;
def : PatGprGpr<umin, MINU>;
def : PatGprGpr<umax, MAXU>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV32] in
def : PatGpr<bswap, REV8_RV32, i32>;
-let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in
+let Predicates = [HasStdExtZbbOrZbkbOrP, IsRV64] in
def : PatGpr<bswap, REV8_RV64, i64>;
let Predicates = [HasStdExtZbkb] in {
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 637d61fe..36a2f46 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -1588,6 +1588,10 @@ multiclass SiFive7SchedResources<int vlen, bit dualVALU,
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
+ // TODO: Scheduling info for XSfvfexp* and XSfvfexpa* on SiFive7 will be
+ // added in follow-up patches.
+ defm : UnsupportedSchedXSfvfexp;
+ defm : UnsupportedSchedXSfvfexpa;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbc;
defm : UnsupportedSchedZbkb;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index 9ab9636..64ccfd8 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -523,6 +523,8 @@ include "RISCVScheduleZvk.td"
// Vendor Extensions
multiclass UnsupportedSchedXsf {
defm : UnsupportedSchedXsfvcp;
+ defm : UnsupportedSchedXSfvfexp;
+ defm : UnsupportedSchedXSfvfexpa;
defm : UnsupportedSchedXSfvfnrclipxfqf;
defm : UnsupportedSchedXSfvfwmaccqqq;
defm : UnsupportedSchedXSfvqmaccdod;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
index 99632e4..1ee6dc1 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td
@@ -99,3 +99,23 @@ defm : LMULWriteRes<"WriteSF_VFWMACC_QQQ", []>;
defm : LMULReadAdvance<"ReadSF_VFWMACC_QQQ", 0>;
} // Unsupported = true
}
+
+defm "" : LMULSEWSchedWritesF<"WriteSF_VFExp">;
+defm "" : LMULSEWSchedReadsF<"ReadSF_VFExp">;
+
+multiclass UnsupportedSchedXSfvfexp {
+let Unsupported = true in {
+defm : LMULSEWWriteResF<"WriteSF_VFExp", []>;
+defm : LMULSEWReadAdvanceF<"ReadSF_VFExp", 0>;
+} // Unsupported = true
+}
+
+defm "" : LMULSEWSchedWritesF<"WriteSF_VFExpa">;
+defm "" : LMULSEWSchedReadsF<"ReadSF_VFExpa">;
+
+multiclass UnsupportedSchedXSfvfexpa {
+let Unsupported = true in {
+defm : LMULSEWWriteResF<"WriteSF_VFExpa", []>;
+defm : LMULSEWReadAdvanceF<"ReadSF_VFExpa", 0>;
+} // Unsupported = true
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 334db4b..4b4fc8f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -187,7 +187,7 @@ public:
}
bool hasCLZLike() const {
- return HasStdExtZbb || HasVendorXTHeadBb ||
+ return HasStdExtZbb || HasStdExtP || HasVendorXTHeadBb ||
(HasVendorXCVbitmanip && !IsRV64);
}
bool hasCTZLike() const {
@@ -197,7 +197,7 @@ public:
return HasStdExtZbb || (HasVendorXCVbitmanip && !IsRV64);
}
bool hasREV8Like() const {
- return HasStdExtZbb || HasStdExtZbkb || HasVendorXTHeadBb;
+ return HasStdExtZbb || HasStdExtZbkb || HasStdExtP || HasVendorXTHeadBb;
}
bool hasBEXTILike() const { return HasStdExtZbs || HasVendorXTHeadBs; }
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 62073ec..4393f6e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4721,9 +4721,6 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
return false;
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
auto getFoldableLogicOp = [](SDValue Op) {
// Peek through single use bitcast.
if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
@@ -4740,13 +4737,47 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
return SDValue();
};
- SDValue A, FoldableOp;
- if ((FoldableOp = getFoldableLogicOp(N1))) {
- A = N0;
- } else if ((FoldableOp = getFoldableLogicOp(N0))) {
- A = N1;
- } else
- return false;
+ SDValue N0, N1, A, FoldableOp;
+
+ // Identify and (optionally) peel an outer NOT that wraps a pure logic tree
+ auto tryPeelOuterNotWrappingLogic = [&](SDNode *Op) {
+ if (Op->getOpcode() == ISD::XOR && Op->hasOneUse() &&
+ ISD::isBuildVectorAllOnes(Op->getOperand(1).getNode())) {
+ SDValue InnerOp = Op->getOperand(0);
+
+ if (!getFoldableLogicOp(InnerOp))
+ return SDValue();
+
+ N0 = InnerOp.getOperand(0);
+ N1 = InnerOp.getOperand(1);
+ if ((FoldableOp = getFoldableLogicOp(N1))) {
+ A = N0;
+ return InnerOp;
+ }
+ if ((FoldableOp = getFoldableLogicOp(N0))) {
+ A = N1;
+ return InnerOp;
+ }
+ }
+ return SDValue();
+ };
+
+ bool PeeledOuterNot = false;
+ SDNode *OriN = N;
+ if (SDValue InnerOp = tryPeelOuterNotWrappingLogic(N)) {
+ PeeledOuterNot = true;
+ N = InnerOp.getNode();
+ } else {
+ N0 = N->getOperand(0);
+ N1 = N->getOperand(1);
+
+ if ((FoldableOp = getFoldableLogicOp(N1)))
+ A = N0;
+ else if ((FoldableOp = getFoldableLogicOp(N0)))
+ A = N1;
+ else
+ return false;
+ }
SDValue B = FoldableOp.getOperand(0);
SDValue C = FoldableOp.getOperand(1);
@@ -4798,7 +4829,10 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
case ISD::XOR: Imm ^= TernlogMagicA; break;
}
- return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
+ if (PeeledOuterNot)
+ Imm = ~Imm;
+
+ return matchVPTERNLOG(OriN, ParentA, ParentB, ParentC, A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
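
The final `Imm = ~Imm` relies on VPTERNLOG's immediate encoding: for every bit position, the input bits (a, b, c) select bit (a*4 + b*2 + c) of the 8-bit immediate, so a NOT of the whole expression is exactly a bitwise complement of the immediate. A standalone sanity check of that identity (plain C++; `ternlog1` is an illustrative helper, not an LLVM API):

  #include <cassert>
  #include <cstdint>

  // Evaluate one bit of a 3-input ternary logic function the way VPTERNLOG
  // interprets its immediate: inputs (a, b, c) select bit (a*4 + b*2 + c).
  static int ternlog1(uint8_t Imm, bool A, bool B, bool C) {
    return (Imm >> ((A << 2) | (B << 1) | C)) & 1;
  }

  int main() {
    for (unsigned Imm = 0; Imm < 256; ++Imm)
      for (unsigned In = 0; In < 8; ++In) {
        bool A = In & 4, B = In & 2, C = In & 1;
        // NOT-ing the whole expression == complementing the immediate.
        assert(ternlog1(uint8_t(~Imm), A, B, C) ==
               (ternlog1(uint8_t(Imm), A, B, C) ^ 1));
      }
  }
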
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4dfc400..410f20e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57617,10 +57617,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
}
// Fold any similar generic ADD/SUB opcodes to reuse this node.
- auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
- if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate) {
// Bail if this is only used by a user of the x86 add/sub.
@@ -57632,8 +57632,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
DCI.CombineTo(GenericAddSub, Op);
}
};
- MatchGeneric(LHS, RHS, false);
- MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+ MatchGeneric(GenericOpc, LHS, RHS, false);
+ MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ if (auto *Const = dyn_cast<ConstantSDNode>(RHS)) {
+ SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+ if (X86ISD::SUB == N->getOpcode()) {
+ // Fold generic add(LHS, -C) to X86ISD::SUB(LHS, C).
+ MatchGeneric(ISD::ADD, LHS, NegC, false);
+ } else {
+ // Negate X86ISD::ADD(LHS, C) and replace generic sub(-C, LHS).
+ MatchGeneric(ISD::SUB, NegC, LHS, true);
+ }
+ } else if (auto *Const = dyn_cast<ConstantSDNode>(LHS)) {
+ if (X86ISD::SUB == N->getOpcode()) {
+ SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+ // Negate X86ISD::SUB(C, RHS) and replace generic add(RHS, -C).
+ MatchGeneric(ISD::ADD, RHS, NegC, true);
+ }
+ }
// TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
// EFLAGS result doesn't change.
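
Both rewrites added here are the usual modular-arithmetic identities, sub(LHS, C) == add(LHS, -C) and sub(-C, LHS) == -(add(LHS, C)). A standalone check using wrapping unsigned arithmetic, which models the DAG's two's-complement integers:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t LHS = 0x12345678, C = 42;
    // X86ISD::SUB(LHS, C) can replace a generic add(LHS, -C).
    assert(LHS - C == LHS + (0u - C));
    // Negating X86ISD::ADD(LHS, C) can replace a generic sub(-C, LHS).
    assert((0u - C) - LHS == 0u - (LHS + C));
  }
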
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e28b9c1..b7151f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1592,7 +1592,6 @@ namespace llvm {
bool useLoadStackGuardNode(const Module &M) const override;
bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
- Function *getSSPStackGuardCheck(const Module &M) const override;
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 37d7772..a61bbe5 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -640,15 +640,6 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
TargetLowering::insertSSPDeclarations(M);
}
-Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
- // MSVC CRT has a function to validate security cookie.
- if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
- Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
- return M.getFunction("__security_check_cookie");
- }
- return TargetLowering::getSSPStackGuardCheck(M);
-}
-
Value *
X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index edcf247..632c6a2 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -1407,7 +1407,7 @@ let isBarrier = 1, isTerminator = 1 in {
let r = 0x04;
}
- def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm),
+ def BREAK_N : RRRN_Inst<0x0D, (outs), (ins uimm4:$imm),
"break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> {
bits<4> imm;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 669d4f0..8d9933b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -582,6 +582,18 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1);
return BinaryOperator::CreateSub(ConstCtlz, X);
}
+
+ // ctlz(~x & (x - 1)) -> bitwidth - cttz(x, false)
+ if (Op0->hasOneUse() &&
+ match(Op0,
+ m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
+ Type *Ty = II.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty,
+ {X, IC.Builder.getFalse()});
+ auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
+ return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
+ }
}
// cttz(Pow2) -> Log2(Pow2)
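
The fold works because ~x & (x - 1) keeps exactly the bits below the lowest set bit of x, so its leading-zero count is the bit width minus the trailing-zero count of x; this also holds at x == 0 provided cttz returns the bit width there, which is why the combine emits cttz(x, false). A standalone C++20 check (std::countr_zero(0u) is 32, matching is_zero_poison == false):

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    // ctlz(~x & (x - 1)) == bitwidth - cttz(x), including x == 0.
    for (uint32_t X : {0u, 1u, 2u, 3u, 0x80u, 0x80000000u, 0xFFFFFFFFu})
      assert(std::countl_zero(~X & (X - 1)) == 32 - std::countr_zero(X));
  }
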
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 5aa8de3..f5130da 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -4697,5 +4697,31 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
cast<IntrinsicInst>(TrueVal)->getParamAlign(0).valueOrOne(),
CondVal, FalseVal));
+ unsigned BitWidth = SI.getType()->getScalarSizeInBits();
+ CmpPredicate Pred;
+ Value *CmpLHS, *CmpRHS;
+
+ // Canonicalize sign function ashr patterns:
+ // select (icmp slt X, 1), ashr X, bitwidth-1, 1 -> scmp(X, 0)
+ // select (icmp sgt X, 0), 1, ashr X, bitwidth-1 -> scmp(X, 0)
+ if (match(&SI, m_Select(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
+ m_Value(TrueVal), m_Value(FalseVal))) &&
+ ((Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_One()) &&
+ match(TrueVal,
+ m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1))) &&
+ match(FalseVal, m_One())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_Zero()) &&
+ match(TrueVal, m_One()) &&
+ match(FalseVal,
+ m_AShr(m_Specific(CmpLHS), m_SpecificInt(BitWidth - 1)))))) {
+
+ Function *Scmp = Intrinsic::getOrInsertDeclaration(
+ SI.getModule(), Intrinsic::scmp, {SI.getType(), SI.getType()});
+ return CallInst::Create(Scmp, {CmpLHS, ConstantInt::get(SI.getType(), 0)});
+ }
+
return nullptr;
}
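
Both select forms compute the scalar sign function: ashr X, 31 yields -1 for negative i32 values and 0 otherwise, and the select patches in 1 for strictly positive inputs, which is exactly scmp(X, 0). A standalone check (`sign` is an illustrative reference; C++20 guarantees arithmetic right shift on signed integers):

  #include <cassert>
  #include <cstdint>

  // Reference three-way sign, the scalar meaning of llvm.scmp(X, 0).
  static int sign(int32_t X) { return (X > 0) - (X < 0); }

  int main() {
    for (int32_t X : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
      // select (icmp slt X, 1), (ashr X, 31), 1
      assert((X < 1 ? X >> 31 : 1) == sign(X));
      // select (icmp sgt X, 0), 1, (ashr X, 31)
      assert((X > 0 ? 1 : X >> 31) == sign(X));
    }
  }
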
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 67e2aae..9c8de45 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2327,6 +2327,18 @@ Constant *InstCombinerImpl::unshuffleConstant(ArrayRef<int> ShMask, Constant *C,
return ConstantVector::get(NewVecC);
}
+// Get the result of `Vector Op Splat` (or Splat Op Vector if \p SplatLHS).
+static Constant *constantFoldBinOpWithSplat(unsigned Opcode, Constant *Vector,
+ Constant *Splat, bool SplatLHS,
+ const DataLayout &DL) {
+ ElementCount EC = cast<VectorType>(Vector->getType())->getElementCount();
+ Constant *LHS = ConstantVector::getSplat(EC, Splat);
+ Constant *RHS = Vector;
+ if (!SplatLHS)
+ std::swap(LHS, RHS);
+ return ConstantFoldBinaryOpOperands(Opcode, LHS, RHS, DL);
+}
+
Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
if (!isa<VectorType>(Inst.getType()))
return nullptr;
@@ -2338,6 +2350,37 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
assert(cast<VectorType>(RHS->getType())->getElementCount() ==
cast<VectorType>(Inst.getType())->getElementCount());
+ auto foldConstantsThroughSubVectorInsertSplat =
+ [&](Value *MaybeSubVector, Value *MaybeSplat,
+ bool SplatLHS) -> Instruction * {
+ Value *Idx;
+ Constant *Splat, *SubVector, *Dest;
+ if (!match(MaybeSplat, m_ConstantSplat(m_Constant(Splat))) ||
+ !match(MaybeSubVector,
+ m_VectorInsert(m_Constant(Dest), m_Constant(SubVector),
+ m_Value(Idx))))
+ return nullptr;
+ SubVector =
+ constantFoldBinOpWithSplat(Opcode, SubVector, Splat, SplatLHS, DL);
+ Dest = constantFoldBinOpWithSplat(Opcode, Dest, Splat, SplatLHS, DL);
+ if (!SubVector || !Dest)
+ return nullptr;
+ auto *InsertVector =
+ Builder.CreateInsertVector(Dest->getType(), Dest, SubVector, Idx);
+ return replaceInstUsesWith(Inst, InsertVector);
+ };
+
+ // If one operand is a constant splat and the other operand is a
+ // `vector.insert` where both the destination and subvector are constant,
+  // apply the operation to both the destination and the subvector, and build
+  // a new `vector.insert` of the folded constants. This helps constant
+  // folding for scalable vectors.
+ if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat(
+ /*MaybeSubVector=*/LHS, /*MaybeSplat=*/RHS, /*SplatLHS=*/false))
+ return Folded;
+ if (Instruction *Folded = foldConstantsThroughSubVectorInsertSplat(
+ /*MaybeSubVector=*/RHS, /*MaybeSplat=*/LHS, /*SplatLHS=*/true))
+ return Folded;
+
// If both operands of the binop are vector concatenations, then perform the
// narrow binop on each pair of the source operands followed by concatenation
// of the results.
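
An elementwise model of the new fold: applying a splat to a vector.insert whose destination and subvector are both constant is the same as folding the splat into each part and re-inserting, which is what allows the constants to fold even when the real types are scalable. A fixed-width std::array sketch (illustrative only; the transform itself operates on LLVM constants):

  #include <algorithm>
  #include <array>
  #include <cassert>

  int main() {
    std::array<int, 8> Dest{1, 2, 3, 4, 5, 6, 7, 8};
    std::array<int, 2> Sub{10, 20};
    const int Splat = 5, Idx = 4;

    // Left-hand side: insert first, then apply `+ Splat` elementwise.
    std::array<int, 8> LHS = Dest;
    std::copy(Sub.begin(), Sub.end(), LHS.begin() + Idx);
    for (int &V : LHS) V += Splat;

    // Right-hand side: fold the splat into both constants, then insert.
    std::array<int, 8> RHS = Dest;
    for (int &V : RHS) V += Splat;
    std::array<int, 2> SubF = Sub;
    for (int &V : SubF) V += Splat;
    std::copy(SubF.begin(), SubF.end(), RHS.begin() + Idx);

    assert(LHS == RHS);
  }
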
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b6cbecb..10b03bb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -226,6 +226,7 @@ static const Align kMinOriginAlignment = Align(4);
static const Align kShadowTLSAlignment = Align(8);
// These constants must be kept in sync with the ones in msan.h.
+// TODO: increase size to match SVE/SVE2/SME/SME2 limits
static const unsigned kParamTLSSize = 800;
static const unsigned kRetvalTLSSize = 800;
@@ -1544,6 +1545,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
+ static bool isAArch64SVCount(Type *Ty) {
+ if (TargetExtType *TTy = dyn_cast<TargetExtType>(Ty))
+ return TTy->getName() == "aarch64.svcount";
+ return false;
+ }
+
+  // This is intended to match the "AArch64 Predicate-as-Counter Type" (aka
+  // 'target("aarch64.svcount")'), but not, e.g., <vscale x 4 x i32>.
+ static bool isScalableNonVectorType(Type *Ty) {
+ if (!isAArch64SVCount(Ty))
+ LLVM_DEBUG(dbgs() << "isScalableNonVectorType: Unexpected type " << *Ty
+ << "\n");
+
+ return Ty->isScalableTy() && !isa<VectorType>(Ty);
+ }
+
void materializeChecks() {
#ifndef NDEBUG
// For assert below.
@@ -1672,6 +1689,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
return Res;
}
+ if (isScalableNonVectorType(OrigTy)) {
+ LLVM_DEBUG(dbgs() << "getShadowTy: Scalable non-vector type: " << *OrigTy
+ << "\n");
+ return OrigTy;
+ }
+
uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
return IntegerType::get(*MS.C, TypeSize);
}
@@ -2185,8 +2208,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
<< *OrigIns << "\n");
return;
}
-#ifndef NDEBUG
+
Type *ShadowTy = Shadow->getType();
+ if (isScalableNonVectorType(ShadowTy)) {
+ LLVM_DEBUG(dbgs() << "Skipping check of scalable non-vector " << *Shadow
+ << " before " << *OrigIns << "\n");
+ return;
+ }
+#ifndef NDEBUG
assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) ||
isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) &&
"Can only insert checks for integer, vector, and aggregate shadow "
@@ -6972,6 +7001,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// an extra "select". This results in much more compact IR.
// Sa = select Sb, poisoned, (select b, Sc, Sd)
Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
+ } else if (isScalableNonVectorType(I.getType())) {
+ // This is intended to handle target("aarch64.svcount"), which can't be
+ // handled in the else branch because of incompatibility with CreateXor
+ // ("The supported LLVM operations on this type are limited to load,
+ // store, phi, select and alloca instructions").
+
+      // TODO: this currently underapproximates (a clean shadow can hide
+      // uninitialized bits); use Arm SVE EOR in the else branch instead.
+ Sa1 = getCleanShadow(getShadowTy(I.getType()));
} else {
// Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
// If Sb (condition is poisoned), look for bits in c and d that are equal
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a1ad2db..2591df8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4172,11 +4172,6 @@ class VPlan {
/// definitions are VPValues that hold a pointer to their underlying IR.
SmallVector<VPValue *, 16> VPLiveIns;
- /// Mapping from SCEVs to the VPValues representing their expansions.
- /// NOTE: This mapping is temporary and will be removed once all users have
- /// been modeled in VPlan directly.
- DenseMap<const SCEV *, VPValue *> SCEVToExpansion;
-
/// Blocks allocated and owned by the VPlan. They will be deleted once the
/// VPlan is destroyed.
SmallVector<VPBlockBase *> CreatedBlocks;
@@ -4424,15 +4419,6 @@ public:
LLVM_DUMP_METHOD void dump() const;
#endif
- VPValue *getSCEVExpansion(const SCEV *S) const {
- return SCEVToExpansion.lookup(S);
- }
-
- void addSCEVExpansion(const SCEV *S, VPValue *V) {
- assert(!SCEVToExpansion.contains(S) && "SCEV already expanded");
- SCEVToExpansion[S] = V;
- }
-
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c385c36..84817d7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -943,12 +943,40 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
+/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
+/// Returns an optional pair, where the first element indicates whether it is
+/// an intrinsic ID.
+static std::optional<std::pair<bool, unsigned>>
+getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
+ return TypeSwitch<const VPSingleDefRecipe *,
+ std::optional<std::pair<bool, unsigned>>>(R)
+ .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>(
+ [](auto *I) { return std::make_pair(false, I->getOpcode()); })
+ .Case<VPWidenIntrinsicRecipe>([](auto *I) {
+ return std::make_pair(true, I->getVectorIntrinsicID());
+ })
+ .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
+ // For recipes that do not directly map to LLVM IR instructions,
+ // assign opcodes after the last VPInstruction opcode (which is also
+ // after the last IR Instruction opcode), based on the VPDefID.
+ return std::make_pair(false,
+ VPInstruction::OpsEnd + 1 + I->getVPDefID());
+ })
+ .Default([](auto *) { return std::nullopt; });
+}
+
/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
-/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
-/// foldable live-ins.
-static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
- ArrayRef<VPValue *> Operands,
- const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
+/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
+/// Operands are foldable live-ins.
+static VPValue *tryToFoldLiveIns(VPSingleDefRecipe &R,
+ ArrayRef<VPValue *> Operands,
+ const DataLayout &DL,
+ VPTypeAnalysis &TypeInfo) {
+ auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
+ if (!OpcodeOrIID)
+ return nullptr;
+
SmallVector<Value *, 4> Ops;
for (VPValue *Op : Operands) {
if (!Op->isLiveIn() || !Op->getLiveInIRValue())
@@ -956,43 +984,57 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
Ops.push_back(Op->getLiveInIRValue());
}
- InstSimplifyFolder Folder(DL);
- if (Instruction::isBinaryOp(Opcode))
- return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
+ auto FoldToIRValue = [&]() -> Value * {
+ InstSimplifyFolder Folder(DL);
+ if (OpcodeOrIID->first) {
+ if (R.getNumOperands() != 2)
+ return nullptr;
+ unsigned ID = OpcodeOrIID->second;
+ return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
+ TypeInfo.inferScalarType(&R));
+ }
+ unsigned Opcode = OpcodeOrIID->second;
+ if (Instruction::isBinaryOp(Opcode))
+ return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
+ Ops[0], Ops[1]);
+ if (Instruction::isCast(Opcode))
+ return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
+ TypeInfo.inferScalarType(R.getVPSingleValue()));
+ switch (Opcode) {
+ case VPInstruction::LogicalAnd:
+ return Folder.FoldSelect(Ops[0], Ops[1],
+ ConstantInt::getNullValue(Ops[1]->getType()));
+ case VPInstruction::Not:
+ return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
+ Constant::getAllOnesValue(Ops[0]->getType()));
+ case Instruction::Select:
+ return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
Ops[1]);
- if (Instruction::isCast(Opcode))
- return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
- TypeInfo.inferScalarType(R.getVPSingleValue()));
- switch (Opcode) {
- case VPInstruction::LogicalAnd:
- return Folder.FoldSelect(Ops[0], Ops[1],
- ConstantInt::getNullValue(Ops[1]->getType()));
- case VPInstruction::Not:
- return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
- Constant::getAllOnesValue(Ops[0]->getType()));
- case Instruction::Select:
- return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
- case Instruction::ICmp:
- case Instruction::FCmp:
- return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
- Ops[1]);
- case Instruction::GetElementPtr: {
- auto &RFlags = cast<VPRecipeWithIRFlags>(R);
- auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
- return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
- RFlags.getGEPNoWrapFlags());
- }
- case VPInstruction::PtrAdd:
- case VPInstruction::WidePtrAdd:
- return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
- Ops[1],
- cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
- // An extract of a live-in is an extract of a broadcast, so return the
- // broadcasted element.
- case Instruction::ExtractElement:
- assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
- return Ops[0];
- }
+ case Instruction::GetElementPtr: {
+ auto &RFlags = cast<VPRecipeWithIRFlags>(R);
+ auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
+ return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
+ drop_begin(Ops), RFlags.getGEPNoWrapFlags());
+ }
+ case VPInstruction::PtrAdd:
+ case VPInstruction::WidePtrAdd:
+ return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
+ Ops[0], Ops[1],
+ cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
+ // An extract of a live-in is an extract of a broadcast, so return the
+ // broadcasted element.
+ case Instruction::ExtractElement:
+ assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
+ return Ops[0];
+ }
+ return nullptr;
+ };
+
+ if (Value *V = FoldToIRValue())
+ return R.getParent()->getPlan()->getOrAddLiveIn(V);
return nullptr;
}
@@ -1006,19 +1048,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
// Simplification of live-in IR values for SingleDef recipes using
// InstSimplifyFolder.
- if (TypeSwitch<VPRecipeBase *, bool>(&R)
- .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
- VPReplicateRecipe, VPWidenSelectRecipe>([&](auto *I) {
- const DataLayout &DL =
- Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
- Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
- TypeInfo);
- if (V)
- I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
- return V;
- })
- .Default([](auto *) { return false; }))
- return;
+ const DataLayout &DL =
+ Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
+ if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
+ return Def->replaceAllUsesWith(V);
// Fold PredPHI LiveIn -> LiveIn.
if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
@@ -1996,29 +2029,6 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
return Def == getEmptyKey() || Def == getTombstoneKey();
}
- /// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
- /// Returns an optional pair, where the first element indicates whether it is
- /// an intrinsic ID.
- static std::optional<std::pair<bool, unsigned>>
- getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
- return TypeSwitch<const VPSingleDefRecipe *,
- std::optional<std::pair<bool, unsigned>>>(R)
- .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
- VPWidenSelectRecipe, VPWidenGEPRecipe, VPReplicateRecipe>(
- [](auto *I) { return std::make_pair(false, I->getOpcode()); })
- .Case<VPWidenIntrinsicRecipe>([](auto *I) {
- return std::make_pair(true, I->getVectorIntrinsicID());
- })
- .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
- // For recipes that do not directly map to LLVM IR instructions,
- // assign opcodes after the last VPInstruction opcode (which is also
- // after the last IR Instruction opcode), based on the VPDefID.
- return std::make_pair(false,
- VPInstruction::OpsEnd + 1 + I->getVPDefID());
- })
- .Default([](auto *) { return std::nullopt; });
- }
-
/// If recipe \p R will lower to a GEP with a non-i8 source element type,
/// return that source element type.
static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
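
For reference, the two VPInstruction-specific cases retained in the fold correspond to simple scalar identities: Not(x) folds as xor(x, all-ones) and LogicalAnd(a, b) folds as select(a, b, false). A standalone sanity check in plain C++:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Not(x) folds as xor(x, all-ones).
    uint32_t X = 0xDEADBEEF;
    assert(~X == (X ^ 0xFFFFFFFFu));
    // LogicalAnd(a, b) folds as select(a, b, false).
    for (bool A : {false, true})
      for (bool B : {false, true})
        assert((A && B) == (A ? B : false));
  }
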
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 06c3d75..fe66f13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -32,8 +32,6 @@ bool vputils::onlyScalarValuesUsed(const VPValue *Def) {
}
VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
- if (auto *Expanded = Plan.getSCEVExpansion(Expr))
- return Expanded;
VPValue *Expanded = nullptr;
if (auto *E = dyn_cast<SCEVConstant>(Expr))
Expanded = Plan.getOrAddLiveIn(E->getValue());
@@ -50,7 +48,6 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
Plan.getEntry()->appendRecipe(Expanded->getDefiningRecipe());
}
}
- Plan.addSCEVExpansion(Expr, Expanded);
return Expanded;
}