-rw-r--r--  llvm/include/llvm/CodeGen/ISDOpcodes.h | 17
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAG.h | 5
-rw-r--r--  llvm/include/llvm/CodeGen/TargetLowering.h | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 30
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 19
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 51
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 10
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 52
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 16
-rw-r--r--  llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll | 731
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll | 49
16 files changed, 963 insertions, 41 deletions
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 68ed812..665c4d6 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1459,6 +1459,23 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
+  // PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2)
+  // The partial reduction nodes sign or zero extend Input1 and Input2 to the
+  // element type of Accumulator before multiplying their results.
+  // This result is concatenated to the Accumulator, and this is then reduced,
+  // using addition, to the result type.
+  // The output is only expected to either be given to another partial reduction
+  // operation or an equivalent vector reduce operation, so the order in which
+  // the elements are reduced is deliberately not specified.
+  // Input1 and Input2 must be the same type. Accumulator and the output must be
+  // the same type.
+  // The number of elements in Input1 and Input2 must be a positive integer
+  // multiple of the number of elements in the Accumulator / output type.
+  // Input1 and Input2 must have an element type which is the same as or smaller
+  // than the element type of the Accumulator and output.
+  PARTIAL_REDUCE_SMLA,
+  PARTIAL_REDUCE_UMLA,
+
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
   // Outputs: output chain, glue
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 461c0c1..cf8e4a3 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1607,11 +1607,6 @@ public:
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
-  /// its operands and ReducedTY is the intrinsic's return type.
-  SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
-                              SDValue Op2);
-
   /// Expands a node with multiple results to an FP or vector libcall. The
   /// libcall is expected to take all the operands of the \p Node followed by
   /// output pointers for each of the results. \p CallRetResNo can be optionally
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a..a4c3d04 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5564,6 +5564,10 @@ public:
   /// temporarily, advance store position, before re-loading the final vector.
   SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const;
 
+  /// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations,
+  /// consisting of zext/sext, extract_subvector, mul and add operations.
+  SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const;
+
   /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC
   /// on the current target. A VP_SETCC will additionally be given a Mask
   /// and/or EVL not equal to SDValue().
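The ISDOpcodes.h comment above describes PARTIAL_REDUCE_[U|S]MLA only in prose. A rough scalar model of the unsigned form is sketched below; the function name, container types and the fixed i8/i32 element widths are illustrative rather than part of the patch. The J % N lane grouping matches the generic expansion added later in this patch, while the node itself only fixes the per-lane sums, not the order in which they are formed.

// Illustrative scalar model of PARTIAL_REDUCE_UMLA (the SMLA form is the same
// except that both inputs are sign extended). Acc has N lanes; In1 and In2
// have a positive integer multiple of N lanes of a narrower element type.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint32_t> partialReduceUMLA(std::vector<uint32_t> Acc,
                                        const std::vector<uint8_t> &In1,
                                        const std::vector<uint8_t> &In2) {
  const size_t N = Acc.size();
  for (size_t J = 0; J < In1.size(); ++J)
    // Zero extend each input lane to the accumulator element type, multiply,
    // and fold the product into the accumulator lane it maps onto.
    Acc[J % N] += uint32_t(In1[J]) * uint32_t(In2[J]);
  return Acc;
}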
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a0f2949..204b323d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -159,6 +159,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntRes_PARTIAL_REDUCE_MLA(N); + break; + case ISD::SIGN_EXTEND: case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -2099,6 +2104,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::VECTOR_FIND_LAST_ACTIVE: Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -2881,6 +2890,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N) { + SmallVector<SDValue, 1> NewOps(N->ops()); + if (N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA) { + NewOps[1] = SExtPromotedInteger(N->getOperand(1)); + NewOps[2] = SExtPromotedInteger(N->getOperand(2)); + } else { + NewOps[1] = ZExtPromotedInteger(N->getOperand(1)); + NewOps[2] = ZExtPromotedInteger(N->getOperand(2)); + } + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -6200,6 +6221,15 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); } +SDValue DAGTypeLegalizer::PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue ExtAcc = GetPromotedInteger(N->getOperand(0)); + return DAG.getNode(N->getOpcode(), DL, NVT, ExtAcc, N->getOperand(1), + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b58c160..69c687a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -379,6 +379,7 @@ private: SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); + SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -430,6 +431,7 @@ private: SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -969,6 +971,7 @@ private: void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi); // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); @@ -1000,6 +1003,7 @@ private: SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); SDValue SplitVecOp_VP_CttzElements(SDNode *N); SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N); + SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 7e8bae4..de4447f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -469,6 +469,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1197,6 +1199,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VECREDUCE_FMINIMUM: Results.push_back(TLI.expandVecReduce(Node, DAG)); return; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); + return; case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_SEQ_FMUL: Results.push_back(TLI.expandVecReduceSeq(Node, DAG)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1d8bf54..9d42ec2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1395,6 +1395,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_REVERSE: SplitVecRes_VP_REVERSE(N, Lo, Hi); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. 
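The integer promotion hooks above widen the PARTIAL_REDUCE_MLA result via GetPromotedInteger and widen the two multiplicand operands with SExtPromotedInteger or ZExtPromotedInteger, chosen to match the opcode. The choice matters because the node itself re-extends its inputs to the accumulator element type, so promoting an SMLA operand with a zero extend would change the value of any negative lane. A minimal scalar sketch of that argument, with purely illustrative values:

// Not part of the patch: shows why operand promotion must use the same
// extension kind as the PARTIAL_REDUCE node. Zero extending a negative SMLA
// input while widening it from i8 to i16 changes the eventual product.
#include <cassert>
#include <cstdint>

int main() {
  int8_t A = -3, B = 5;
  int32_t Expected = int32_t(A) * int32_t(B);  // -15, the SMLA contribution
  int16_t SExtA = int16_t(A);                  // sign-extended promotion: -3
  uint16_t ZExtA = uint16_t(uint8_t(A));       // zero-extended promotion: 253
  assert(int32_t(SExtA) * int32_t(B) == Expected);  // still -15
  assert(int32_t(ZExtA) * int32_t(B) != Expected);  // 1265, wrong
  return 0;
}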
@@ -3213,6 +3217,13 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, std::tie(Lo, Hi) = DAG.SplitVector(Load, DL); } +void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); +} + void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { unsigned Factor = N->getNumOperands(); @@ -3431,6 +3442,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = SplitVecOp_VECTOR_HISTOGRAM(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -4485,6 +4500,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { MMO, IndexType); } +SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { + return TLI.expandPartialReduceMLA(N, DAG); +} + //===----------------------------------------------------------------------===// // Result Vector Widening //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9d2f874..80c2de1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2474,35 +2474,6 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } -SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, - SDValue Op2) { - EVT FullTy = Op2.getValueType(); - - unsigned Stride = ReducedTy.getVectorMinNumElements(); - unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; - - // Collect all of the subvectors - std::deque<SDValue> Subvectors = {Op1}; - for (unsigned I = 0; I < ScaleFactor; I++) { - auto SourceIndex = getVectorIdxConstant(I * Stride, DL); - Subvectors.push_back( - getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex})); - } - - // Flatten the subvector tree - while (Subvectors.size() > 1) { - Subvectors.push_back( - getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]})); - Subvectors.pop_front(); - Subvectors.pop_front(); - } - - assert(Subvectors.size() == 1 && - "There should only be one subvector after tree flattening"); - - return Subvectors[0]; -} - /// Given a store node \p StoreNode, return true if it is safe to fold that node /// into \p FPNode, which expands to a library call with output pointers. 
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, @@ -7883,6 +7854,28 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; } + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: { + [[maybe_unused]] EVT AccVT = N1.getValueType(); + [[maybe_unused]] EVT Input1VT = N2.getValueType(); + [[maybe_unused]] EVT Input2VT = N3.getValueType(); + assert(Input1VT.isVector() && Input1VT == Input2VT && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have the same type!"); + assert(VT.isVector() && VT == AccVT && + "Expected the first operand of the PARTIAL_REDUCE_MLA node to have " + "the same type as its result!"); + assert(Input1VT.getVectorElementCount().hasKnownScalarFactor( + AccVT.getVectorElementCount()) && + "Expected the element count of the second and third operands of the " + "PARTIAL_REDUCE_MLA node to be a positive integer multiple of the " + "element count of the first operand and the result!"); + assert(N2.getScalarValueSizeInBits() <= N1.getScalarValueSizeInBits() && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have an element type which is the same as or smaller than " + "the element type of the first operand and result!"); + break; + } } // Memoize node if it doesn't produce a glue result. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 78a6e24..1c58a7f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8115,15 +8115,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::experimental_vector_partial_reduce_add: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) { visitTargetIntrinsic(I, Intrinsic); return; } - - setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()), - getValue(I.getOperand(0)), - getValue(I.getOperand(1)))); + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, + DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8de5371..8457bee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -569,6 +569,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECTOR_FIND_LAST_ACTIVE: return "find_last_active"; + case ISD::PARTIAL_REDUCE_UMLA: + return "partial_reduce_umla"; + case ISD::PARTIAL_REDUCE_SMLA: + return "partial_reduce_smla"; + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ case ISD::SDID: \ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index adfb960..7771958 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include <cctype> +#include <deque> using namespace llvm; /// NOTE: The TargetMachine owns TLOF. 
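With the SelectionDAGBuilder change above, a target that does not handle llvm.experimental.vector.partial.reduce.add itself now receives a PARTIAL_REDUCE_UMLA whose second multiplicand is a constant splat of 1, so the plain accumulate form and the multiply-accumulate form share one node, and getNode() now asserts the operand typing. The sketch below restates the element-count rule those assertions rely on, reusing the ElementCount helper that the new assertion itself calls; the isValidPartialReduceShape wrapper is hypothetical, not part of the patch.

// Assumes the in-tree ElementCount API from llvm/Support/TypeSize.h.
#include "llvm/Support/TypeSize.h"
using llvm::ElementCount;

// True when the input vectors' element count is a known positive integer
// multiple of the accumulator's, e.g. nxv16i8 inputs feeding an nxv4i32
// accumulator (factor 4); an <8 x i8> input with a <3 x i32> accumulator has
// no integer factor and would trip the new assertion.
bool isValidPartialReduceShape(ElementCount InputEC, ElementCount AccEC) {
  return InputEC.hasKnownScalarFactor(AccEC);
}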
@@ -11890,6 +11891,57 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); } +SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + SDValue Acc = N->getOperand(0); + SDValue MulLHS = N->getOperand(1); + SDValue MulRHS = N->getOperand(2); + EVT AccVT = Acc.getValueType(); + EVT MulOpVT = MulLHS.getValueType(); + + EVT ExtMulOpVT = + EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), + MulOpVT.getVectorElementCount()); + unsigned ExtOpc = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA + ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + + if (ExtMulOpVT != MulOpVT) { + MulLHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulLHS); + MulRHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulRHS); + } + SDValue Input = MulLHS; + APInt ConstantOne; + if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || + !ConstantOne.isOne()) + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + + unsigned Stride = AccVT.getVectorMinNumElements(); + unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; + + // Collect all of the subvectors + std::deque<SDValue> Subvectors = {Acc}; + for (unsigned I = 0; I < ScaleFactor; I++) { + auto SourceIndex = DAG.getVectorIdxConstant(I * Stride, DL); + Subvectors.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, AccVT, {Input, SourceIndex})); + } + + // Flatten the subvector tree + while (Subvectors.size() > 1) { + Subvectors.push_back( + DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + Subvectors.pop_front(); + Subvectors.pop_front(); + } + + assert(Subvectors.size() == 1 && + "There should only be one subvector after tree flattening"); + + return Subvectors[0]; +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 51cde7c..f5ea3c0 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -835,6 +835,10 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); setOperationAction(ISD::RESET_FPENV, VT, Expand); + + // PartialReduceMLA operations default to expand. + setOperationAction({ISD::PARTIAL_REDUCE_UMLA, ISD::PARTIAL_REDUCE_SMLA}, VT, + Expand); } // Most targets ignore the @llvm.prefetch intrinsic. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d47a0bfa..50be082 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -154,6 +154,13 @@ cl::opt<bool> EnableSVEGISel( cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false)); +// FIXME : This is a temporary flag, and is used to help transition to +// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD +// nodes. +static cl::opt<bool> EnablePartialReduceNodes( + "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden, + cl::desc("Use the new method of lowering partial reductions.")); + /// Value type used for condition codes. 
static const MVT MVT_CC = MVT::i32; @@ -2050,6 +2057,8 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( const IntrinsicInst *I) const { if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) return true; + if (EnablePartialReduceNodes) + return true; EVT VT = EVT::getEVT(I->getType()); auto Op1 = I->getOperand(1); @@ -21978,8 +21987,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return Dot; if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG)) return WideAdd; - return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); + SDLoc DL(N); + SDValue Input = N->getOperand(2); + return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0), + N->getOperand(1), Input, + DAG.getConstant(1, DL, Input.getValueType())); } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 9ece9ed..40daf8f 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-DOT-LABEL: udot: diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 66f83c6..455231d 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -1,12 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { ; CHECK-LABEL: udot: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; 
CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32> @@ -20,6 +44,29 @@ define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64> @@ -33,6 +80,29 @@ define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32> @@ -46,6 +116,29 @@ define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo 
z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64> @@ -82,6 +175,29 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32> @@ -118,6 +234,29 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32> @@ -136,6 +275,63 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 
4 x i64> %acc, <vscale x 16 x i8 ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: add z1.d, z1.d, z3.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_8to64: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2 +; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s +; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s +; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s +; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s +; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s +; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d +; CHECK-NEWLOWERING-NEXT: movprfx z2, z27 +; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d +; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d +; CHECK-NEWLOWERING-NEXT: movprfx z3, z4 +; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d +; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2 +; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64> @@ -155,6 +351,63 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8 ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: add z1.d, z1.d, z3.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_8to64: +; 
CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2 +; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s +; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s +; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s +; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s +; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s +; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d +; CHECK-NEWLOWERING-NEXT: movprfx z2, z27 +; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d +; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d +; CHECK-NEWLOWERING-NEXT: movprfx z3, z4 +; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d +; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2 +; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64> @@ -231,6 +484,63 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i ; CHECK-NOI8MM-NEXT: addvl sp, sp, #2 ; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: usdot_8to64: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2 +; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s +; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s +; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s +; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s +; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s +; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d +; CHECK-NEWLOWERING-NEXT: movprfx z2, z27 +; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d +; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d +; CHECK-NEWLOWERING-NEXT: movprfx z3, z4 +; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d +; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2 +; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64> @@ -307,6 +617,63 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i ; CHECK-NOI8MM-NEXT: addvl sp, sp, #2 ; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sudot_8to64: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2 +; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16 +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s +; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s +; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s +; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s +; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s +; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d +; CHECK-NEWLOWERING-NEXT: movprfx z2, z27 +; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d +; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d +; CHECK-NEWLOWERING-NEXT: movprfx z3, z4 +; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d +; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2 +; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64> @@ -322,6 +689,20 @@ define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 ; CHECK-NEXT: mov z2.b, #1 // =0x1 ; CHECK-NEXT: udot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_no_bin_op: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h +; 
CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s +; CHECK-NEWLOWERING-NEXT: add z1.s, z2.s, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z4.s, z0.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32> %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext) ret <vscale x 4 x i32> %partial.reduce @@ -333,6 +714,20 @@ define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 ; CHECK-NEXT: mov z2.b, #1 // =0x1 ; CHECK-NEXT: sdot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s +; CHECK-NEWLOWERING-NEXT: add z1.s, z2.s, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z4.s, z0.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32> %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext) ret <vscale x 4 x i32> %partial.reduce @@ -344,6 +739,20 @@ define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale ; CHECK-NEXT: mov z2.h, #1 // =0x1 ; CHECK-NEXT: udot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide) @@ -356,6 +765,20 @@ define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale ; CHECK-NEXT: mov z2.h, #1 // =0x1 ; CHECK-NEXT: sdot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale 
x 8 x i64> %a.wide) @@ -373,6 +796,32 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: add z1.d, z1.d, z3.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_8to64: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d +; CHECK-NEWLOWERING-NEXT: add z4.d, z25.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z7.d, z1.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEWLOWERING-NEXT: ret %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64> %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext) ret <vscale x 4 x i64> %partial.reduce @@ -389,6 +838,32 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: add z1.d, z1.d, z3.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_8to64: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z5.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d +; CHECK-NEWLOWERING-NEXT: add z4.d, z25.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z2.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z7.d, z1.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEWLOWERING-NEXT: ret %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64> %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext) ret <vscale x 4 x i64> %partial.reduce @@ -407,6 +882,19 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> % ; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; 
CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32> %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32> @@ -428,6 +916,19 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x ; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d ; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_udot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64> %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64> @@ -459,6 +960,29 @@ define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64> @@ -490,6 +1014,29 @@ define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo 
z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64> @@ -522,6 +1069,30 @@ define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_different_types: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64> @@ -555,6 +1126,31 @@ define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_different_types: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: ptrue p0.h +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64> @@ -588,6 +1184,31 @@ define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscal ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; 
CHECK-NEWLOWERING-LABEL: usdot_different_types: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: ptrue p0.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64> @@ -620,6 +1241,30 @@ define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscal ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sudot_different_types: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64> %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64> @@ -627,3 +1272,89 @@ entry: %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult) ret <vscale x 2 x i64> %partial.reduce } + +define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){ +; CHECK-LABEL: udot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: 
udot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16> + %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16> + %mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide + %partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult) + ret <vscale x 2 x i16> %partial.reduce +} + +define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){ +; CHECK-LABEL: sdot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: ptrue p0.h +; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16> + %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16> + %mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide + %partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult) + ret <vscale x 2 x i16> %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index b4b946c..11fb60e 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2 ; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE 
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEWLOWERING define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){ ; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32: @@ -16,6 +17,14 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc ; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d ; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv4i32: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64> %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide) @@ -36,6 +45,14 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v ; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d ; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv4i32: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64> %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide) @@ -56,6 +73,14 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc ; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s ; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i16: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32> %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide) @@ -76,6 +101,14 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v ; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s ; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i16: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32> %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide) @@ -96,6 +129,14 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc ; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h ; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: 
signed_wide_add_nxv16i8: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEWLOWERING-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16> %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide) @@ -116,6 +157,14 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v ; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h ; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h ; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv16i8: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEWLOWERING-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16> %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide) |
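All of the CHECK-NEWLOWERING blocks above follow the same generic expansion: the wide (or extended) input is split into accumulator-sized halves with uunpklo/uunpkhi or sunpklo/sunpkhi, each half is multiplied and/or added, and everything is folded back into the accumulator. As a rough IR-level sketch of what the signed nxv4i32 wide-add case computes (an illustrative equivalent only, not the literal DAG the legalizer builds; the function name is made up for this example and the extract indices assume the documented "split into accumulator-sized subvectors" behaviour):

declare <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64>, i64)

define <vscale x 2 x i64> @partial_reduce_add_expansion_sketch(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input) {
  ; Sign-extend the input to the accumulator's element type.
  %wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
  ; Split the extended input into two accumulator-sized subvectors
  ; (this is what sunpklo/sunpkhi implement in the CHECK lines above).
  %lo = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> %wide, i64 0)
  %hi = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> %wide, i64 2)
  ; Accumulate both halves; the reduction order is deliberately unspecified.
  %acc.lo = add <vscale x 2 x i64> %acc, %lo
  %res = add <vscale x 2 x i64> %acc.lo, %hi
  ret <vscale x 2 x i64> %res
}

The MLA variants in sve-partial-reduce-dot-product.ll are the same shape, except that both operands are extended and multiplied before the halves are accumulated, which is why those expansions use mla/mul instead of plain add.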