Diffstat (limited to 'llvm/lib')
34 files changed, 494 insertions, 236 deletions
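For context on the headline change below (the new `llvm.vector.partial.reduce.fadd` intrinsic wired through the Verifier, SelectionDAG and the AArch64 backend), here is a minimal sketch of how a frontend or pass might emit the intrinsic with IRBuilder. The helper name is hypothetical, and the overload set (accumulator type plus input vector type) is assumed to mirror `llvm.vector.partial.reduce.add`; this is an illustration, not part of the patch.

```cpp
// Hypothetical helper; assumes the intrinsic is overloaded on the accumulator
// and input vector types, mirroring llvm.vector.partial.reduce.add.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitPartialReduceFAdd(IRBuilder<> &B, Value *Acc, Value *Input) {
  Module *M = B.GetInsertBlock()->getModule();
  // Declares e.g. llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f16 for an
  // nxv4f32 accumulator and an nxv8f16 input -- the pairing this patch marks
  // Legal on SVE2p1/SME2 so it can select to FDOT.
  Function *Decl = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::vector_partial_reduce_fadd,
      {Acc->getType(), Input->getType()});
  return B.CreateCall(Decl, {Acc, Input});
}
```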
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index e45d1f7..b3b62cf 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -407,9 +407,10 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA, continue; Value *Ptr = getLoadStorePointerOperand(&Inst); const Loop *L = LI.getLoopFor(Inst.getParent()); + const Loop *OutermostLoop = L ? L->getOutermostLoop() : nullptr; const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L); const SCEV *AccessFn = SE.removePointerBase(PtrSCEV); - SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L); + SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, OutermostLoop); OS.indent(2) << "Inst: " << Inst << "\n"; OS.indent(4) << "Expr: " << *AccessFn << "\n"; Mon.print(OS, 4); @@ -945,6 +946,8 @@ SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) { SCEVMonotonicity SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr, const Loop *OutermostLoop) { + assert((!OutermostLoop || OutermostLoop->isOutermost()) && + "OutermostLoop must be outermost"); assert(Expr->getType()->isIntegerTy() && "Expr must be integer type"); this->OutermostLoop = OutermostLoop; return visit(Expr); diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index 3110577..c3f7a0c 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -213,10 +213,13 @@ Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, // Remove the current node and its IDs from the stack. PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); - CursorStack.pop_back(); + // Push new node into created objects. PrimaryRefStack.push_back(*NewNode); CreatedObjects.try_emplace(Cur.Ref, *NewNode); + + // Pop the cursor in the end after all uses. 
+ CursorStack.pop_back(); continue; } diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp index ae9d818..7b790bb 100644 --- a/llvm/lib/CAS/UnifiedOnDiskCache.cpp +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -174,7 +174,7 @@ getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { return createFileError(Path, EC); llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { - return LHS.Order <= RHS.Order; + return LHS.Order < RHS.Order; }); SmallVector<std::string, 4> DBDirs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 10daca5..f144f17 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -13006,6 +13007,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -13014,7 +13018,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13033,13 +13037,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13047,6 +13054,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13071,7 +13079,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13088,6 +13096,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13115,30 +13125,33 @@ SDValue 
DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); @@ -13148,8 +13161,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4..94751be5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bb4a8d9..dd5c011 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1474,6 +1474,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3689,6 +3690,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode 
*N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0a06752..bbc1d73 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); @@ -13064,6 +13065,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2f598b2..88b0809 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8187,6 +8187,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index d3e1628..ec5edd5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -590,6 +590,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9bdf822..b51d664 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12074,22 +12074,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + if (!llvm::isOneOrOneSplatFP(MulRHS)) + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; @@ -12099,10 +12109,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp index 34174f9..ca918f6 100644 --- a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp +++ b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp @@ -377,8 +377,10 @@ Error SyntheticTypeNameBuilder::addTypeName(UnitEntryPairTy InputUnitEntryPair, } break; } - // If name for the DIE is not determined yet add referenced types to the name. - if (!HasLinkageName && !HasShortName && !HasDeclFileName) { + // If name for the DIE is not determined yet or if the DIE is a typedef, add + // referenced types to the name. 
+ if ((!HasLinkageName && !HasShortName && !HasDeclFileName) || + InputUnitEntryPair.DieEntry->getTag() == dwarf::DW_TAG_typedef) { if (InputUnitEntryPair.CU->find(InputUnitEntryPair.DieEntry, getODRAttributes())) if (Error Err = addReferencedODRDies(InputUnitEntryPair, AddParentNames, diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp index 1009cc9..8e476cd 100644 --- a/llvm/lib/Demangle/ItaniumDemangle.cpp +++ b/llvm/lib/Demangle/ItaniumDemangle.cpp @@ -25,10 +25,6 @@ using namespace llvm; using namespace llvm::itanium_demangle; -constexpr const char *itanium_demangle::FloatData<float>::spec; -constexpr const char *itanium_demangle::FloatData<double>::spec; -constexpr const char *itanium_demangle::FloatData<long double>::spec; - // <discriminator> := _ <non-negative number> # when number < 10 // := __ <non-negative number> _ # when number >= 10 // extension := decimal-digit+ # at the end of string diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index f1e473a..59eb870 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6583,6 +6583,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index f9fda23..3f0ecbe 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -311,6 +311,10 @@ static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) { return "v73"; case 75: return "v75"; + case 79: + return "v79"; + case 81: + return "v81"; default: return {}; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 40e6400..c8a038f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1916,6 +1916,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } + + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask @@ -2283,6 +2289,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); + } + // Lower fixed length vector operations to scalable equivalents. 
setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -7875,6 +7886,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 58a53af..bb2f083 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> { def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>; } -class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind> +class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern> : BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101, - V128, asm, ".16b", []> { + V128, asm, ".16b", pattern> { let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b", "|", kind, "\t$Rd, $Rn, $Rm}"); } -multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{ - def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{ +multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{ + def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h", + [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F16MM]; } - def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{ + def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s", + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let Predicates = [HasNEON, HasF8F32MM]; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index e6954f7..76f076a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">; let Uses = [FPMR, FPCR] in - defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; + defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>; //===----------------------------------------------------------------------===// // Contention Management Hints (FEAT_CMH) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dc..e1f4386 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : 
sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp index 65e6ed9..c52eb4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp @@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, const UniformityInfo &UI, ValueMap<const Value *, bool> &Tracker) { llvm::Intrinsic::ID IID = II.getIntrinsicID(); - + /// We deliberately do not simplify readfirstlane with a uniform argument, so + /// that frontends can use it to force a copy to SGPR and thereby prevent the + /// backend from generating unwanted waterfall loops. switch (IID) { case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: case Intrinsic::amdgcn_readlane: { Value *Src = II.getArgOperand(0); if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker)) @@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II, return Changed; } default: - llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic"); + return false; } return false; } @@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) { auto *II = dyn_cast<IntrinsicInst>(&I); if (!II) continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_permlane64: - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_ballot: - break; - default: - continue; - } IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker); } return IsChanged; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 84984a0..964309b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { return false; MI->setDesc(TII->get(NewMFMAOpc)); MI->untieRegOperand(0); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned I = 0; I < MI->getNumDefs(); ++I) + if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1) + MI->getOperand(I).setIsEarlyClobber(true); } // TODO: Should we try to avoid adding this to the candidate list? 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6ce18ea..9c74c65 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10163,7 +10163,7 @@ static bool followSubRegDef(MachineInstr &MI, } MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI) { assert(MRI.isSSA()); if (!P.Reg.isVirtual()) return nullptr; @@ -10628,6 +10628,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, const SIRegisterInfo &RI) { MachineInstr *KillsSCC = nullptr; + if (SCCValid->getParent() != SCCRedefine->getParent()) + return false; for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()), SCCRedefine->getIterator())) { if (MI.modifiesRegister(AMDGPU::SCC, &RI)) @@ -10672,8 +10674,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (CmpValue != 0) return false; - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; // For S_OP that set SCC = DST!=0, do the transformation @@ -10692,6 +10694,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!optimizeSCC(Def, &CmpInstr, RI)) return false; + // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit + // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a + // 64-bit foldableSelect then delete s_or_b32 in the sequence: + // sX = s_cselect_b64 (non-zero imm), 0 + // sLo = copy sX.sub0 + // sHi = copy sX.sub1 + // sY = s_or_b32 sLo, sHi + if (Def->getOpcode() == AMDGPU::S_OR_B32 && + MRI->use_nodbg_empty(Def->getOperand(0).getReg())) { + const MachineOperand &OrOpnd1 = Def->getOperand(1); + const MachineOperand &OrOpnd2 = Def->getOperand(2); + if (OrOpnd1.isReg() && OrOpnd2.isReg()) { + MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg()); + MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg()); + if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 && + Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() && + Def2->getOperand(1).isReg() && + Def1->getOperand(1).getSubReg() == AMDGPU::sub0 && + Def2->getOperand(1).getSubReg() == AMDGPU::sub1 && + Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { + MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); + if (Select && foldableSelect(*Select)) + optimizeSCC(Select, Def, RI); + } + } + } return true; }; @@ -10721,8 +10749,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n - MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); - if (!Def || Def->getParent() != CmpInstr.getParent()) + MachineInstr *Def = MRI->getVRegDef(SrcReg); + if (!Def) return false; if (Def->getOpcode() != AMDGPU::S_AND_B32 && diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 0643b53..8d693b1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, /// skipping copy like instructions and subreg-manipulation pseudos. 
/// Following another subreg of a reg:subreg isn't supported. MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, - MachineRegisterInfo &MRI); + const MachineRegisterInfo &MRI); /// \brief Return false if EXEC is not changed between the def of \p VReg at \p /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bfac639..caff354 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { - if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) - continue; - - unsigned I = Op.getOperandNo(); + if (Op.isReg()) { + if (TRI->isVGPR(*MRI, Op.getReg())) + continue; - int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]); - if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass))) + if (ST.hasSDWAScalar() && ConstantBusCount == 0) { + ++ConstantBusCount; + continue; + } + } else if (!Op.isImm()) continue; - if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() && - TRI->isSGPRReg(*MRI, Op.getReg())) { - ++ConstantBusCount; + unsigned I = Op.getOperandNo(); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I, TRI); + if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; - } Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 20fc849..dd233e2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -657,6 +657,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); + if (Subtarget.isISA3_0() && isPPC64) { + setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom); + setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom); + setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom); + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom); @@ -11917,6 +11928,62 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, return getDataClassTest(LHS, Category, Dl, DAG, Subtarget); } +// Adjust the length value for a load/store with length to account for the +// instructions requiring a left justified length, and for non-byte element +// types requiring scaling by element size. +static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left, + SelectionDAG &DAG) { + SDLoc dl(Val); + EVT VT = Val->getValueType(0); + unsigned LeftAdj = Left ? 
VT.getSizeInBits() - 8 : 0; + unsigned TypeAdj = llvm::countr_zero<uint32_t>(Bits / 8); + SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT); + return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt); +} + +SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const { + auto VPLD = cast<VPLoadSDNode>(Op); + bool Future = Subtarget.isISAFuture(); + SDLoc dl(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4)); + unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl; + unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits(); + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32), + VPLD->getOperand(1), Len}; + SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue VPL = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops, + VPLD->getMemoryVT(), VPLD->getMemOperand()); + return VPL; +} + +SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const { + auto VPST = cast<VPStoreSDNode>(Op); + assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) && + "Mask predication not supported"); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5)); + unsigned EltBits = + Op->getOperand(1).getValueType().getScalarType().getSizeInBits(); + bool Future = Subtarget.isISAFuture(); + unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl; + Len = AdjustLength(Len, EltBits, !Future, DAG); + SDValue Ops[] = { + VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)), + VPST->getOperand(2), Len}; + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue VPS = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, + VPST->getMemoryVT(), VPST->getMemOperand()); + return VPS; +} + SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -12771,6 +12838,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { if (Op->getFlags().hasNoFPExcept()) return Op; return SDValue(); + case ISD::VP_LOAD: + return LowerVP_LOAD(Op, DAG); + case ISD::VP_STORE: + return LowerVP_STORE(Op, DAG); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 880aca7..d967018 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1345,6 +1345,9 @@ namespace llvm { SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index b04e887..e74f1bd 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -24,6 +24,10 @@ using namespace llvm; #define DEBUG_TYPE "ppctti" 
+static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl", + cl::desc("Allow vp.load and vp.store for pwr9"), + cl::init(false), cl::Hidden); + static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost", cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden); @@ -1031,3 +1035,42 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const { return TLI->supportsTailCallFor(CB); } + +// Target hook used by CodeGen to decide whether to expand vector predication +// intrinsics into scalar operations or to use special ISD nodes to represent +// them. The Target will not see the intrinsics. +TargetTransformInfo::VPLegalization +PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const { + using VPLegalization = TargetTransformInfo::VPLegalization; + unsigned Directive = ST->getCPUDirective(); + VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI); + if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE && + (!Pwr9EVL || Directive != PPC::DIR_PWR9)) + return DefaultLegalization; + + if (!ST->isPPC64()) + return DefaultLegalization; + + unsigned IID = PI.getIntrinsicID(); + if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store) + return DefaultLegalization; + + bool IsLoad = IID == Intrinsic::vp_load; + Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType(); + EVT VT = TLI->getValueType(DL, VecTy, true); + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + VT != MVT::v16i8) + return DefaultLegalization; + + auto IsAllTrueMask = [](Value *MaskVal) { + if (Value *SplattedVal = getSplatValue(MaskVal)) + if (auto *ConstValue = dyn_cast<Constant>(SplattedVal)) + return ConstValue->isAllOnesValue(); + return false; + }; + unsigned MaskIx = IsLoad ? 1 : 2; + if (!IsAllTrueMask(PI.getOperand(MaskIx))) + return DefaultLegalization; + + return VPLegalization(VPLegalization::Legal, VPLegalization::Legal); +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 8d7f255..f80ebdb 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -150,6 +150,9 @@ public: ArrayRef<Type *> Types) const override; bool supportsTailCallFor(const CallBase *CB) const override; + TargetTransformInfo::VPLegalization + getVPLegalizationStrategy(const VPIntrinsic &PI) const override; + private: // The following constant is used for estimating costs on power9. static const InstructionCost::CostType P9PipelineFlushEstimate = 80; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 7bc0b5b..332433b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -2140,7 +2140,8 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume memory ops cost scale with the number of vector registers // possible accessed by the instruction. Note that BasicTTI already // handles the LT.first term for us. 
- if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize) + if (ST->hasVInstructions() && LT.second.isVector() && + CostKind != TTI::TCK_CodeSize) BaseCost *= TLI->getLMULCost(LT.second); return Cost + BaseCost; } diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index 21a233b2..b7a93e7 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -216,6 +216,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI, MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() || MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER || MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() || + MI.getOpcode() == TargetOpcode::RELOC_NONE || // These have a size that may be zero: MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP || MI.getOpcode() == SystemZ::PATCHPOINT || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 090f649..05a854a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45022,11 +45022,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector<int, 8> Mask; @@ -45052,6 +45057,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( } break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.isVector()) { + APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly, + Depth + 1); + } + return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1); + } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); @@ -45096,13 +45111,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE target shuffles. case X86ISD::INSERTPS: case X86ISD::PSHUFB: + case X86ISD::VZEXT_MOVL: case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: + case X86ISD::VBROADCAST: return false; // SSE comparisons handle all icmp/fcmp cases. // TODO: Add CMPM/MM with test coverage. 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b7224a3..666033b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7488,11 +7488,12 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { }); } -VPWidenMemoryRecipe * -VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { - assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && +VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, + VFRange &Range) { + assert((VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) && "Must be called with either a load or store"); + Instruction *I = VPI->getUnderlyingInstr(); auto WillWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = @@ -7522,7 +7523,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; - VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1]; + VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0) + : VPI->getOperand(1); if (Consecutive) { auto *GEP = dyn_cast<GetElementPtrInst>( Ptr->getUnderlyingValue()->stripPointerCasts()); @@ -7536,77 +7538,78 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, CM.foldTailByMasking() || !GEP ? GEPNoWrapFlags::none() : GEP->getNoWrapFlags().withoutNoUnsignedWrap(); - VectorPtr = - new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), - /*Stride*/ -1, Flags, I->getDebugLoc()); + VectorPtr = new VPVectorEndPointerRecipe( + Ptr, &Plan.getVF(), getLoadStoreType(I), + /*Stride*/ -1, Flags, VPI->getDebugLoc()); } else { VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), - I->getDebugLoc()); + VPI->getDebugLoc()); } Builder.insert(VectorPtr); Ptr = VectorPtr; } - if (LoadInst *Load = dyn_cast<LoadInst>(I)) + if (VPI->getOpcode() == Instruction::Load) { + auto *Load = cast<LoadInst>(I); return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, VPIRMetadata(*Load, LVer), I->getDebugLoc()); + } StoreInst *Store = cast<StoreInst>(I); - return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, - Reverse, VPIRMetadata(*Store, LVer), - I->getDebugLoc()); + return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, + Consecutive, Reverse, + VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); } -/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also -/// insert a recipe to expand the step for the induction recipe. +/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will +/// also insert a recipe to expand the step for the induction recipe. 
static VPWidenIntOrFpInductionRecipe * -createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, - VPValue *Start, const InductionDescriptor &IndDesc, - VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { - assert(IndDesc.getStartValue() == - Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); +createWidenInductionRecipes(VPInstruction *PhiR, + const InductionDescriptor &IndDesc, VPlan &Plan, + ScalarEvolution &SE, Loop &OrigLoop) { assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && "step must be loop invariant"); + VPValue *Start = PhiR->getOperand(0); + assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start && + "Start VPValue must match IndDesc's start value"); + VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); - if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) { - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, TruncI, - TruncI->getDebugLoc()); - } - assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here"); + PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr()); return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), - IndDesc, Phi->getDebugLoc()); + IndDesc, PhiR->getDebugLoc()); } -VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( - PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPHeaderPHIRecipe * +VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) { + auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr()); // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) - return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan, - *PSE.getSE(), *OrigLoop); + return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop); // Check if this is pointer induction. If so, build the recipe for it. if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep()); return new VPWidenPointerInductionRecipe( - Phi, Operands[0], Step, &Plan.getVFxUF(), *II, + Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isScalarAfterVectorization(Phi, VF); }, Range), - Phi->getDebugLoc()); + VPI->getDebugLoc()); } return nullptr; } -VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( - TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { +VPWidenIntOrFpInductionRecipe * +VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI, + VFRange &Range) { + auto *I = cast<TruncInst>(VPI->getUnderlyingInstr()); // Optimize the special case where the source is a constant integer // induction variable. 
Notice that we can only optimize the 'trunc' case // because (a) FP conversions lose precision, (b) sext/zext may wrap, and @@ -7621,21 +7624,24 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( }; }; - if (LoopVectorizationPlanner::getDecisionAndClampRange( - IsOptimizableIVTruncate(I), Range)) { + if (!LoopVectorizationPlanner::getDecisionAndClampRange( + IsOptimizableIVTruncate(I), Range)) + return nullptr; - auto *Phi = cast<PHINode>(I->getOperand(0)); - const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); - VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue()); - return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(), - *OrigLoop); - } - return nullptr; + auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>( + VPI->getOperand(0)->getDefiningRecipe()); + PHINode *Phi = WidenIV->getPHINode(); + VPValue *Start = WidenIV->getStartValue(); + const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor(); + VPValue *Step = + vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep()); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), + IndDesc, I, VPI->getDebugLoc()); } -VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, - ArrayRef<VPValue *> Operands, +VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, VFRange &Range) { + CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr()); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI, VF); @@ -7652,7 +7658,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size())); + SmallVector<VPValue *, 4> Ops(VPI->op_begin(), + VPI->op_begin() + CI->arg_size()); // Is it beneficial to perform intrinsic call compared to lib call? bool ShouldUseVectorIntrinsic = @@ -7664,7 +7671,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Range); if (ShouldUseVectorIntrinsic) return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), - CI->getDebugLoc()); + VPI->getDebugLoc()); Function *Variant = nullptr; std::optional<unsigned> MaskPos; @@ -7711,13 +7718,13 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, Mask = getBlockInMask(Builder.getInsertBlock()); else Mask = Plan.getOrAddLiveIn( - ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext()))); + ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext()))); Ops.insert(Ops.begin() + *MaskPos, Mask); } - Ops.push_back(Operands.back()); - return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); + Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1)); + return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc()); } return nullptr; @@ -7737,9 +7744,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef<VPValue *> Operands) { - switch (I->getOpcode()) { +VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { + auto *I = VPI->getUnderlyingInstr(); + switch (VPI->getOpcode()) { default: return nullptr; case Instruction::SDiv: @@ -7749,10 +7756,11 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
if (CM.isPredicatedInst(I)) { - SmallVector<VPValue *> Ops(Operands); + SmallVector<VPValue *> Ops(VPI->operands()); VPValue *Mask = getBlockInMask(Builder.getInsertBlock()); VPValue *One = Plan.getConstantInt(I->getType(), 1u); - auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc()); + auto *SafeRHS = + Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; return new VPWidenRecipe(*I, Ops); } @@ -7777,8 +7785,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Sub: case Instruction::Xor: case Instruction::Freeze: { - SmallVector<VPValue *> NewOps(Operands); - if (Instruction::isBinaryOp(I->getOpcode())) { + SmallVector<VPValue *> NewOps(VPI->operands()); + if (Instruction::isBinaryOp(VPI->getOpcode())) { // The legacy cost model uses SCEV to check if some of the operands are // constants. To match the legacy cost model's behavior, use SCEV to try // to replace operands with constants. @@ -7795,7 +7803,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return Plan.getOrAddLiveIn(C->getValue()); }; // For Mul, the legacy cost model checks both operands. - if (I->getOpcode() == Instruction::Mul) + if (VPI->getOpcode() == Instruction::Mul) NewOps[0] = GetConstantViaSCEV(NewOps[0]); // For other binops, the legacy cost model only checks the second operand. NewOps[1] = GetConstantViaSCEV(NewOps[1]); @@ -7803,7 +7811,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, return new VPWidenRecipe(*I, NewOps); } case Instruction::ExtractValue: { - SmallVector<VPValue *> NewOps(Operands); + SmallVector<VPValue *> NewOps(VPI->operands()); auto *EVI = cast<ExtractValueInst>(I); assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; @@ -7813,9 +7821,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, }; } -VPHistogramRecipe * -VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, - ArrayRef<VPValue *> Operands) { +VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, + VPInstruction *VPI) { // FIXME: Support other operations. unsigned Opcode = HI->Update->getOpcode(); assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && @@ -7823,7 +7830,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, SmallVector<VPValue *, 3> HGramOps; // Bucket address. - HGramOps.push_back(Operands[1]); + HGramOps.push_back(VPI->getOperand(1)); // Increment value. 
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1))); @@ -7832,12 +7839,12 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, if (Legal->isMaskRequired(HI->Store)) HGramOps.push_back(getBlockInMask(Builder.getInsertBlock())); - return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc()); + return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc()); } -VPReplicateRecipe * -VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, - VFRange &Range) { +VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, + VFRange &Range) { + auto *I = VPI->getUnderlyingInstr(); bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); @@ -7893,8 +7900,8 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask, - VPIRMetadata(*I, LVer)); + auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, + BlockInMask, VPIRMetadata(*I, LVer)); return Recipe; } @@ -8075,8 +8082,6 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. VPRecipeBase *Recipe; - Instruction *Instr = R->getUnderlyingInstr(); - SmallVector<VPValue *, 4> Operands(R->operands()); if (auto *PhiR = dyn_cast<VPPhi>(R)) { VPBasicBlock *Parent = PhiR->getParent(); [[maybe_unused]] VPRegionBlock *LoopRegionOf = @@ -8084,15 +8089,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && "Non-header phis should have been handled during predication"); auto *Phi = cast<PHINode>(R->getUnderlyingInstr()); - assert(Operands.size() == 2 && "Must have 2 operands for header phis"); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) + assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis"); + if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range))) return Recipe; VPHeaderPHIRecipe *PhiRecipe = nullptr; assert((Legal->isReductionVariable(Phi) || Legal->isFixedOrderRecurrence(Phi)) && "can only widen reductions and fixed-order recurrences here"); - VPValue *StartV = Operands[0]; + VPValue *StartV = R->getOperand(0); if (Legal->isReductionVariable(Phi)) { const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi); assert(RdxDesc.getRecurrenceStartValue() == @@ -8112,13 +8117,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); } // Add backedge value. - PhiRecipe->addOperand(Operands[1]); + PhiRecipe->addOperand(R->getOperand(1)); return PhiRecipe; } assert(!R->isPhi() && "only VPPhi nodes expected at this point"); - if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate( - cast<TruncInst>(Instr), Operands, Range))) + auto *VPI = cast<VPInstruction>(R); + Instruction *Instr = R->getUnderlyingInstr(); + if (VPI->getOpcode() == Instruction::Trunc && + (Recipe = tryToOptimizeInductionTruncate(VPI, Range))) return Recipe; // All widen recipes below deal only with VF > 1. 
@@ -8126,46 +8133,46 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, [&](ElementCount VF) { return VF.isScalar(); }, Range)) return nullptr; - if (auto *CI = dyn_cast<CallInst>(Instr)) - return tryToWidenCall(CI, Operands, Range); + if (VPI->getOpcode() == Instruction::Call) + return tryToWidenCall(VPI, Range); - if (StoreInst *SI = dyn_cast<StoreInst>(Instr)) - if (auto HistInfo = Legal->getHistogramInfo(SI)) - return tryToWidenHistogram(*HistInfo, Operands); + if (VPI->getOpcode() == Instruction::Store) + if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) + return tryToWidenHistogram(*HistInfo, VPI); - if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr)) - return tryToWidenMemory(Instr, Operands, Range); + if (VPI->getOpcode() == Instruction::Load || + VPI->getOpcode() == Instruction::Store) + return tryToWidenMemory(VPI, Range); if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) - return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); + return tryToCreatePartialReduction(VPI, ScaleFactor.value()); if (!shouldWiden(Instr, Range)) return nullptr; - if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr)) - return new VPWidenGEPRecipe(GEP, Operands); + if (VPI->getOpcode() == Instruction::GetElementPtr) + return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands()); - if (auto *SI = dyn_cast<SelectInst>(Instr)) { - return new VPWidenSelectRecipe(*SI, Operands); - } + if (VPI->getOpcode() == Instruction::Select) + return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands()); - if (auto *CI = dyn_cast<CastInst>(Instr)) { - return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), - *CI); + if (Instruction::isCast(VPI->getOpcode())) { + auto *CI = cast<CastInst>(Instr); + return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), + CI->getType(), *CI); } - return tryToWiden(Instr, Operands); + return tryToWiden(VPI); } VPRecipeBase * -VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, - ArrayRef<VPValue *> Operands, +VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, unsigned ScaleFactor) { - assert(Operands.size() == 2 && + assert(Reduction->getNumOperands() == 2 && "Unexpected number of operands for partial reduction"); - VPValue *BinOp = Operands[0]; - VPValue *Accumulator = Operands[1]; + VPValue *BinOp = Reduction->getOperand(0); + VPValue *Accumulator = Reduction->getOperand(1); VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); if (isa<VPReductionPHIRecipe>(BinOpRecipe) || isa<VPPartialReductionRecipe>(BinOpRecipe)) @@ -8176,28 +8183,29 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "all accumulators in chain must have same scale factor"); unsigned ReductionOpcode = Reduction->getOpcode(); + auto *ReductionI = Reduction->getUnderlyingInstr(); if (ReductionOpcode == Instruction::Sub) { - auto *const Zero = ConstantInt::get(Reduction->getType(), 0); + auto *const Zero = ConstantInt::get(ReductionI->getType(), 0); SmallVector<VPValue *, 2> Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*Reduction, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } VPValue *Cond = nullptr; - if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) { + if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) { assert((ReductionOpcode == Instruction::Add || 
             ReductionOpcode == Instruction::Sub) &&
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
+    VPValue *Zero = Plan.getConstantInt(ReductionI->getType(), 0);
     BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
   return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
-                                      ScaleFactor, Reduction);
+                                      ScaleFactor, ReductionI);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8382,7 +8390,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       VPRecipeBase *Recipe =
           RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
       if (!Recipe)
-        Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
+        Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef),
+                                                 Range);
 
       RecipeBuilder.setRecipe(Instr, Recipe);
       if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 41878e3..a7000af 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -93,42 +93,37 @@ class VPRecipeBuilder {
   /// Range. The function should not be called for memory instructions or calls.
   bool shouldWiden(Instruction *I, VFRange &Range) const;
 
-  /// Check if the load or store instruction \p I should widened for \p
+  /// Check if the load or store instruction \p VPI should widened for \p
   /// Range.Start and potentially masked. Such instructions are handled by a
   /// recipe that takes an additional VPInstruction for the mask.
-  VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
-                                        ArrayRef<VPValue *> Operands,
-                                        VFRange &Range);
+  VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
 
-  /// Check if an induction recipe should be constructed for \p Phi. If so build
+  /// Check if an induction recipe should be constructed for \p VPI. If so build
   /// and return it. If not, return null.
-  VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
-                                               ArrayRef<VPValue *> Operands,
+  VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI,
                                                VFRange &Range);
 
-  /// Optimize the special case where the operand of \p I is a constant integer
-  /// induction variable.
+  /// Optimize the special case where the operand of \p VPI is a constant
+  /// integer induction variable.
   VPWidenIntOrFpInductionRecipe *
-  tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
-                                 VFRange &Range);
+  tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range);
 
-  /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+  /// Handle call instructions. If \p VPI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
   /// decreased to ensure same decision from \p Range.Start to \p Range.End.
-  VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range);
+  VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range);
 
-  /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
-  /// if it can. The function should only be called if the cost-model indicates
-  /// that widening should be performed.
-  VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
+  /// Check if \p VPI has an opcode that can be widened and return a
+  /// VPWidenRecipe if it can. The function should only be called if the
+  /// cost-model indicates that widening should be performed.
+  VPWidenRecipe *tryToWiden(VPInstruction *VPI);
 
   /// Makes Histogram count operations safe for vectorization, by emitting a
   /// llvm.experimental.vector.histogram.add intrinsic in place of the
   /// Load + Add|Sub + Store operations that perform the histogram in the
   /// original scalar loop.
   VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
-                                         ArrayRef<VPValue *> Operands);
+                                         VPInstruction *VPI);
 
   /// Examines reduction operations to see if the target can use a cheaper
   /// operation with a wider per-iteration input VF and narrower PHI VF.
@@ -171,8 +166,7 @@ public:
 
   /// Create and return a partial reduction recipe for a reduction instruction
   /// along with binary operation and reduction phi operands.
-  VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
-                                            ArrayRef<VPValue *> Operands,
+  VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
                                             unsigned ScaleFactor);
 
   /// Set the recipe created for given ingredient.
@@ -197,12 +191,10 @@ public:
     return Ingredient2Recipe[I];
   }
 
-  /// Build a VPReplicationRecipe for \p I using \p Operands. If it is
-  /// predicated, add the mask as last operand. Range.End may be decreased to
-  /// ensure same recipe behavior from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(Instruction *I,
-                                       ArrayRef<VPValue *> Operands,
-                                       VFRange &Range);
+  /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask
+  /// as last operand. Range.End may be decreased to ensure same recipe behavior
+  /// from \p Range.Start to \p Range.End.
+  VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range);
 
   VPValue *getVPValueOrAddLiveIn(Value *V) {
     if (auto *I = dyn_cast<Instruction>(V)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697..634df51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1288,8 +1288,9 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
   // Look through broadcast of single-scalar when used as select conditions; in
   // that case the scalar condition can be used directly.
   if (match(Def,
-            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) &&
-      vputils::isSingleScalar(C)) {
+            m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
+    assert(vputils::isSingleScalar(C) &&
+           "broadcast operand must be single-scalar");
     Def->setOperand(0, C);
     return;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 27a8bbd..ed3a0a0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -696,11 +696,11 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
 /// shuffle.
 bool VectorCombine::foldInsExtFNeg(Instruction &I) {
   // Match an insert (op (extract)) pattern.
-  Value *DestVec;
-  uint64_t Index;
+  Value *DstVec;
+  uint64_t ExtIdx, InsIdx;
   Instruction *FNeg;
-  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
-                             m_ConstantInt(Index))))
+  if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
+                             m_ConstantInt(InsIdx))))
     return false;
 
   // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
@@ -708,67 +708,74 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
   Instruction *Extract;
   if (!match(FNeg, m_FNeg(m_CombineAnd(
                        m_Instruction(Extract),
-                       m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
+                       m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
     return false;
 
-  auto *VecTy = cast<FixedVectorType>(I.getType());
-  auto *ScalarTy = VecTy->getScalarType();
+  auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
+  auto *DstVecScalarTy = DstVecTy->getScalarType();
   auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
-  if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
+  if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
     return false;
 
-  // Ignore bogus insert/extract index.
-  unsigned NumElts = VecTy->getNumElements();
-  if (Index >= NumElts)
+  // Ignore if insert/extract index is out of bounds or destination vector has
+  // one element
+  unsigned NumDstElts = DstVecTy->getNumElements();
+  unsigned NumSrcElts = SrcVecTy->getNumElements();
+  if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
     return false;
 
   // We are inserting the negated element into the same lane that we extracted
   // from. This is equivalent to a select-shuffle that chooses all but the
   // negated element from the destination vector.
-  SmallVector<int> Mask(NumElts);
+  SmallVector<int> Mask(NumDstElts);
   std::iota(Mask.begin(), Mask.end(), 0);
-  Mask[Index] = Index + NumElts;
+  Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
 
   InstructionCost OldCost =
-      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
-      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
+      TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
+      TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
 
   // If the extract has one use, it will be eliminated, so count it in the
   // original cost. If it has more than one use, ignore the cost because it will
   // be the same before/after.
   if (Extract->hasOneUse())
-    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
+    OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
 
   InstructionCost NewCost =
-      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
-                         Mask, CostKind);
+      TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
+                         DstVecTy, Mask, CostKind);
 
-  bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
+  bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
   // If the lengths of the two vectors are not equal,
   // we need to add a length-change vector. Add this cost.
   SmallVector<int> SrcMask;
   if (NeedLenChg) {
-    SrcMask.assign(NumElts, PoisonMaskElem);
-    SrcMask[Index] = Index;
+    SrcMask.assign(NumDstElts, PoisonMaskElem);
+    SrcMask[ExtIdx % NumDstElts] = ExtIdx;
     NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                  VecTy, SrcVecTy, SrcMask, CostKind);
+                                  DstVecTy, SrcVecTy, SrcMask, CostKind);
   }
 
+  LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
+                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
   if (NewCost > OldCost)
     return false;
 
-  Value *NewShuf;
-  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
+  Value *NewShuf, *LenChgShuf = nullptr;
+  // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
   Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
   if (NeedLenChg) {
-    // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
-    Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
-    NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
+    // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
+    LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
+    NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
+    Worklist.pushValue(LenChgShuf);
   } else {
-    // shuffle DestVec, (fneg SrcVec), Mask
-    NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
+    // shuffle DstVec, (fneg SrcVec), Mask
+    NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
   }
 
+  Worklist.pushValue(VecFNeg);
   replaceValue(I, *NewShuf);
   return true;
 }
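
Note on the VectorCombine.cpp hunks above: after this change the extracted lane (ExtIdx) and the inserted lane (InsIdx) no longer have to match, so both the select-shuffle mask and the optional length-change mask are derived from the two indices. The standalone sketch below (plain C++, not LLVM code; all values and names are illustrative) mirrors the two mask formulas from the patch.

// Standalone sketch (plain C++, not LLVM code): how foldInsExtFNeg forms its
// shuffle masks once the extract lane (ExtIdx) and insert lane (InsIdx) may
// differ. PoisonElem stands in for PoisonMaskElem; values are illustrative.
#include <cstdio>
#include <numeric>
#include <vector>

static const int PoisonElem = -1;

int main() {
  unsigned NumDstElts = 4, NumSrcElts = 8;
  unsigned ExtIdx = 6, InsIdx = 1;

  // Select-shuffle mask: keep every destination lane except InsIdx, which is
  // taken from the second (negated-source) operand of the shuffle.
  std::vector<int> Mask(NumDstElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[InsIdx] = static_cast<int>((ExtIdx % NumDstElts) + NumDstElts);

  // When source and destination lengths differ, a single-source shuffle first
  // moves the negated source lane into a destination-sized vector.
  std::vector<int> SrcMask;
  if (NumSrcElts != NumDstElts) {
    SrcMask.assign(NumDstElts, PoisonElem);
    SrcMask[ExtIdx % NumDstElts] = static_cast<int>(ExtIdx);
  }

  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  for (int M : SrcMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

With NumDstElts = 4, NumSrcElts = 8, ExtIdx = 6, InsIdx = 1 this prints "0 6 2 3" and "-1 -1 6 -1", matching Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts and SrcMask[ExtIdx % NumDstElts] = ExtIdx from the hunk.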

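The LoopVectorize.cpp and VPRecipeBuilder.h hunks all follow one pattern: builder entry points that used to take an Instruction* plus a separate ArrayRef<VPValue *> operand list now take the VPInstruction, which already carries both its operands and the underlying IR instruction. A toy sketch of that parameter shape (plain C++, not LLVM code; every name below is illustrative only):

// Toy sketch (plain C++, not LLVM code) of the signature change applied across
// VPRecipeBuilder: the single recipe argument carries its own operand list and
// a pointer back to the scalar instruction, so callers no longer pass both.
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

struct ScalarInstr { // stands in for llvm::Instruction
  unsigned Opcode;
};

struct Recipe { // stands in for VPInstruction / VPSingleDefRecipe
  ScalarInstr *Underlying;
  std::vector<int> Operands; // ints stand in for VPValue*
  unsigned getOpcode() const { return Underlying->Opcode; }
  int getOperand(unsigned I) const { return Operands[I]; }
  std::size_t getNumOperands() const { return Operands.size(); }
};

// Before: buildFor(ScalarInstr *I, const std::vector<int> &Operands);
// After: one argument is enough; operands come from the recipe itself.
void buildFor(const Recipe &R) {
  assert(R.getNumOperands() >= 1 && "expected at least one operand");
  std::printf("opcode %u, first operand %d\n", R.getOpcode(), R.getOperand(0));
}

int main() {
  ScalarInstr Add{13};
  Recipe R{&Add, {7, 9}};
  buildFor(R);
  return 0;
}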