Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Analysis/DependenceAnalysis.cpp | 5
-rw-r--r-- llvm/lib/CAS/ObjectStore.cpp | 5
-rw-r--r-- llvm/lib/CAS/UnifiedOnDiskCache.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 33
-rw-r--r-- llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp | 6
-rw-r--r-- llvm/lib/Demangle/ItaniumDemangle.cpp | 4
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 1
-rw-r--r-- llvm/lib/Object/ELFObjectFile.cpp | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrFormats.td | 16
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp | 17
-rw-r--r-- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 21
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 71
-rw-r--r-- llvm/lib/Target/PowerPC/PPCISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 43
-rw-r--r-- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 3
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 3
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZLongBranch.cpp | 1
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 221
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 46
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 67
34 files changed, 494 insertions, 236 deletions
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index e45d1f7..b3b62cf 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -407,9 +407,10 @@ static void dumpExampleDependence(raw_ostream &OS, DependenceInfo *DA,
continue;
Value *Ptr = getLoadStorePointerOperand(&Inst);
const Loop *L = LI.getLoopFor(Inst.getParent());
+ const Loop *OutermostLoop = L ? L->getOutermostLoop() : nullptr;
const SCEV *PtrSCEV = SE.getSCEVAtScope(Ptr, L);
const SCEV *AccessFn = SE.removePointerBase(PtrSCEV);
- SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, L);
+ SCEVMonotonicity Mon = Checker.checkMonotonicity(AccessFn, OutermostLoop);
OS.indent(2) << "Inst: " << Inst << "\n";
OS.indent(4) << "Expr: " << *AccessFn << "\n";
Mon.print(OS, 4);
@@ -945,6 +946,8 @@ SCEVMonotonicity SCEVMonotonicityChecker::invariantOrUnknown(const SCEV *Expr) {
SCEVMonotonicity
SCEVMonotonicityChecker::checkMonotonicity(const SCEV *Expr,
const Loop *OutermostLoop) {
+ assert((!OutermostLoop || OutermostLoop->isOutermost()) &&
+ "OutermostLoop must be outermost");
assert(Expr->getType()->isIntegerTy() && "Expr must be integer type");
this->OutermostLoop = OutermostLoop;
return visit(Expr);
diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp
index 3110577..c3f7a0c 100644
--- a/llvm/lib/CAS/ObjectStore.cpp
+++ b/llvm/lib/CAS/ObjectStore.cpp
@@ -213,10 +213,13 @@ Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream,
// Remove the current node and its IDs from the stack.
PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount);
- CursorStack.pop_back();
+ // Push the new node onto the primary ref stack and record it in CreatedObjects.
PrimaryRefStack.push_back(*NewNode);
CreatedObjects.try_emplace(Cur.Ref, *NewNode);
+
+ // Pop the cursor at the end, after all uses of Cur.
+ CursorStack.pop_back();
continue;
}
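// A minimal, self-contained sketch of the use-after-pop hazard the reordering
// above avoids (illustrative types only, assuming Cur aliases
// CursorStack.back() as in the surrounding code):
#include <cassert>
#include <vector>

struct Cursor { int Ref; int RefsCount; };

int main() {
  std::vector<Cursor> CursorStack = {{1, 0}, {2, 1}};
  Cursor &Cur = CursorStack.back();
  // Wrong order: CursorStack.pop_back() here would leave Cur referring to an
  // element past the end before its last use below.
  int SavedRef = Cur.Ref; // use Cur while the element is still live ...
  CursorStack.pop_back(); // ... and pop it afterwards, as the patched code does.
  assert(SavedRef == 2);
  return 0;
}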
diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
index ae9d818..7b790bb 100644
--- a/llvm/lib/CAS/UnifiedOnDiskCache.cpp
+++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
@@ -174,7 +174,7 @@ getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) {
return createFileError(Path, EC);
llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
- return LHS.Order <= RHS.Order;
+ return LHS.Order < RHS.Order;
});
SmallVector<std::string, 4> DBDirs;
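// Why the comparator above must use '<' rather than '<=' (illustrative sketch,
// not LLVM code; DBDirLike stands in for the real DBDir): llvm::sort, like
// std::sort, requires a strict weak ordering, and a '<=' comparator returns
// true in both directions for equal keys, which is undefined behaviour.
#include <algorithm>
#include <cassert>
#include <vector>

struct DBDirLike { unsigned Order; };

int main() {
  std::vector<DBDirLike> Dirs = {{3}, {1}, {1}, {2}};
  std::sort(Dirs.begin(), Dirs.end(),
            [](const DBDirLike &LHS, const DBDirLike &RHS) {
              return LHS.Order < RHS.Order; // strict: equal keys compare false
            });
  assert(std::is_sorted(Dirs.begin(), Dirs.end(),
                        [](const DBDirLike &LHS, const DBDirLike &RHS) {
                          return LHS.Order < RHS.Order;
                        }));
  return 0;
}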
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 10daca5..f144f17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
return visitPARTIAL_REDUCE_MLA(N);
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
@@ -13006,6 +13007,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
//
// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
// -> partial_reduce_*mla(acc, x, C)
+//
+// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0))
+// -> partial_reduce_fmla(acc, a, b)
SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
SDLoc DL(N);
auto *Context = DAG.getContext();
@@ -13014,7 +13018,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
SDValue Op2 = N->getOperand(2);
unsigned Opc = Op1->getOpcode();
- if (Opc != ISD::MUL && Opc != ISD::SHL)
+ if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL)
return SDValue();
SDValue LHS = Op1->getOperand(0);
@@ -13033,13 +13037,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
Opc = ISD::MUL;
}
- APInt C;
- if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) ||
- !C.isOne())
+ if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) &&
+ !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2)))
return SDValue();
+ auto IsIntOrFPExtOpcode = [](unsigned int Opcode) {
+ return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND);
+ };
+
unsigned LHSOpcode = LHS->getOpcode();
- if (!ISD::isExtOpcode(LHSOpcode))
+ if (!IsIntOrFPExtOpcode(LHSOpcode))
return SDValue();
SDValue LHSExtOp = LHS->getOperand(0);
@@ -13047,6 +13054,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
// -> partial_reduce_*mla(acc, x, C)
+ APInt C;
if (ISD::isConstantSplatVector(RHS.getNode(), C)) {
// TODO: Make use of partial_reduce_sumla here
APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits());
@@ -13071,7 +13079,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
}
unsigned RHSOpcode = RHS->getOpcode();
- if (!ISD::isExtOpcode(RHSOpcode))
+ if (!IsIntOrFPExtOpcode(RHSOpcode))
return SDValue();
SDValue RHSExtOp = RHS->getOperand(0);
@@ -13088,6 +13096,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) {
NewOpc = ISD::PARTIAL_REDUCE_SUMLA;
std::swap(LHSExtOp, RHSExtOp);
+ } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) {
+ NewOpc = ISD::PARTIAL_REDUCE_FMLA;
} else
return SDValue();
// For a 2-stage extend the signedness of both of the extends must match
@@ -13115,30 +13125,33 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
// partial.reduce.sumla(acc, sext(op), splat(1))
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
+// partial.reduce.fmla(acc, fpext(op), splat(1.0))
+// -> partial.reduce.fmla(acc, op, splat(1.0))
SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
SDLoc DL(N);
SDValue Acc = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Op2 = N->getOperand(2);
- APInt ConstantOne;
- if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) ||
- !ConstantOne.isOne())
+ if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2))
return SDValue();
unsigned Op1Opcode = Op1.getOpcode();
- if (!ISD::isExtOpcode(Op1Opcode))
+ if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND)
return SDValue();
- bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND;
+ bool Op1IsSigned =
+ Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND;
bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA;
EVT AccElemVT = Acc.getValueType().getVectorElementType();
if (Op1IsSigned != NodeIsSigned &&
Op1.getValueType().getVectorElementType() != AccElemVT)
return SDValue();
- unsigned NewOpcode =
- Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+ unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA
+ ? ISD::PARTIAL_REDUCE_FMLA
+ : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA
+ : ISD::PARTIAL_REDUCE_UMLA;
SDValue UnextOp1 = Op1.getOperand(0);
EVT UnextOp1VT = UnextOp1.getValueType();
@@ -13148,8 +13161,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
TLI.getTypeToTransformTo(*Context, UnextOp1VT)))
return SDValue();
+ SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA
+ ? DAG.getConstantFP(1, DL, UnextOp1VT)
+ : DAG.getConstant(1, DL, UnextOp1VT);
+
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1,
- DAG.getConstant(1, DL, UnextOp1VT));
+ Constant);
}
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
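// Scalar model of the new FMLA fold above (illustrative only, not DAG code):
// PARTIAL_REDUCE_FMLA conceptually fp-extends both inputs, multiplies them and
// accumulates, so feeding it fmul(fpext(a), fpext(b)) together with a
// splat(1.0) multiplier is equivalent to feeding it a and b directly.
// float -> double stands in for the fp_extend here.
#include <cassert>

static double unfoldedLane(double Acc, float A, float B) {
  double Mul = static_cast<double>(A) * static_cast<double>(B); // fmul(fpext(a), fpext(b))
  return Acc + Mul * 1.0;                                       // ... with splat(1.0)
}

static double foldedLane(double Acc, float A, float B) {
  return Acc + static_cast<double>(A) * static_cast<double>(B); // partial_reduce_fmla(acc, a, b)
}

int main() {
  assert(unfoldedLane(2.5, 1.5f, -3.0f) == foldedLane(2.5, 1.5f, -3.0f));
  return 0;
}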
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8e423c4..94751be5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
Action =
TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0),
Node->getOperand(1).getValueType());
@@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
Results.push_back(TLI.expandPartialReduceMLA(Node, DAG));
return;
case ISD::VECREDUCE_SEQ_FADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index bb4a8d9..dd5c011 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1474,6 +1474,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi);
break;
case ISD::GET_ACTIVE_LANE_MASK:
@@ -3689,6 +3690,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
Res = SplitVecOp_PARTIAL_REDUCE_MLA(N);
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a06752..bbc1d73 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SMLA:
- case ISD::PARTIAL_REDUCE_SUMLA: {
+ case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA: {
[[maybe_unused]] EVT AccVT = N1.getValueType();
[[maybe_unused]] EVT Input1VT = N2.getValueType();
[[maybe_unused]] EVT Input2VT = N3.getValueType();
@@ -13064,6 +13065,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) {
return C && C->isOne();
}
+bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) {
+ ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs);
+ return C && C->isExactlyValue(1.0);
+}
+
bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) {
N = peekThroughBitcasts(N);
unsigned BitWidth = N.getScalarValueSizeInBits();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2f598b2..88b0809 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8187,6 +8187,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Input, DAG.getConstant(1, sdl, Input.getValueType())));
return;
}
+ case Intrinsic::vector_partial_reduce_fadd: {
+ SDValue Acc = getValue(I.getOperand(0));
+ SDValue Input = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(
+ ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc,
+ Input, DAG.getConstantFP(1.0, sdl, Input.getValueType())));
+ return;
+ }
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index d3e1628..ec5edd5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -590,6 +590,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
return "partial_reduce_smla";
case ISD::PARTIAL_REDUCE_SUMLA:
return "partial_reduce_sumla";
+ case ISD::PARTIAL_REDUCE_FMLA:
+ return "partial_reduce_fmla";
case ISD::LOOP_DEPENDENCE_WAR_MASK:
return "loop_dep_war";
case ISD::LOOP_DEPENDENCE_RAW_MASK:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9bdf822..b51d664 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12074,22 +12074,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N,
EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(),
MulOpVT.getVectorElementCount());
- unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA
- ? ISD::ZERO_EXTEND
- : ISD::SIGN_EXTEND;
- unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA
- ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
+ unsigned ExtOpcLHS, ExtOpcRHS;
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case ISD::PARTIAL_REDUCE_UMLA:
+ ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND;
+ break;
+ case ISD::PARTIAL_REDUCE_SMLA:
+ ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND;
+ break;
+ case ISD::PARTIAL_REDUCE_FMLA:
+ ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND;
+ break;
+ }
if (ExtMulOpVT != MulOpVT) {
MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS);
MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS);
}
SDValue Input = MulLHS;
- APInt ConstantOne;
- if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) ||
- !ConstantOne.isOne())
+ if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) {
+ if (!llvm::isOneOrOneSplatFP(MulRHS))
+ Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS);
+ } else if (!llvm::isOneOrOneSplat(MulRHS)) {
Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS);
+ }
unsigned Stride = AccVT.getVectorMinNumElements();
unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride;
@@ -12099,10 +12109,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N,
for (unsigned I = 0; I < ScaleFactor; I++)
Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride));
+ unsigned FlatNode =
+ N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD;
+
// Flatten the subvector tree
while (Subvectors.size() > 1) {
Subvectors.push_back(
- DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]}));
+ DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]}));
Subvectors.pop_front();
Subvectors.pop_front();
}
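// Reference model of the expansion above for the FMLA case (illustrative C++,
// not SelectionDAG code; the function and variable names are invented for the
// sketch). Given an accumulator of N lanes and an input of K*N lanes, the
// expansion optionally multiplies the fp-extended inputs, then folds each
// N-lane subvector into the accumulator with FADD, as the Subvectors loop does
// (ignoring the summation order of the pairwise tree).
#include <cassert>
#include <vector>

static std::vector<float>
expandPartialReduceFMLA(std::vector<float> Acc,              // N lanes
                        const std::vector<float> &MulLHS,    // K*N lanes, fp-extended
                        const std::vector<float> &MulRHS) {  // K*N lanes, fp-extended
  const size_t Stride = Acc.size();
  std::vector<float> Input(MulLHS.size());
  for (size_t I = 0; I < MulLHS.size(); ++I)
    Input[I] = MulLHS[I] * MulRHS[I];   // skipped when MulRHS is splat(1.0)
  for (size_t Sub = 0; Sub < Input.size() / Stride; ++Sub)
    for (size_t Lane = 0; Lane < Stride; ++Lane)
      Acc[Lane] += Input[Sub * Stride + Lane];
  return Acc;
}

int main() {
  // v2f32 accumulator, v4f32 input: input lanes {0,2} fold into Acc[0],
  // lanes {1,3} into Acc[1].
  std::vector<float> R = expandPartialReduceFMLA({1, 1}, {1, 2, 3, 4}, {1, 1, 1, 1});
  assert(R[0] == 1 + 1 + 3 && R[1] == 1 + 2 + 4);
  return 0;
}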
diff --git a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp
index 34174f9..ca918f6 100644
--- a/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/SyntheticTypeNameBuilder.cpp
@@ -377,8 +377,10 @@ Error SyntheticTypeNameBuilder::addTypeName(UnitEntryPairTy InputUnitEntryPair,
} break;
}
- // If name for the DIE is not determined yet add referenced types to the name.
- if (!HasLinkageName && !HasShortName && !HasDeclFileName) {
+ // If name for the DIE is not determined yet or if the DIE is a typedef, add
+ // referenced types to the name.
+ if ((!HasLinkageName && !HasShortName && !HasDeclFileName) ||
+ InputUnitEntryPair.DieEntry->getTag() == dwarf::DW_TAG_typedef) {
if (InputUnitEntryPair.CU->find(InputUnitEntryPair.DieEntry,
getODRAttributes()))
if (Error Err = addReferencedODRDies(InputUnitEntryPair, AddParentNames,
diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp
index 1009cc9..8e476cd 100644
--- a/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -25,10 +25,6 @@
using namespace llvm;
using namespace llvm::itanium_demangle;
-constexpr const char *itanium_demangle::FloatData<float>::spec;
-constexpr const char *itanium_demangle::FloatData<double>::spec;
-constexpr const char *itanium_demangle::FloatData<long double>::spec;
-
// <discriminator> := _ <non-negative number> # when number < 10
// := __ <non-negative number> _ # when number >= 10
// extension := decimal-digit+ # at the end of string
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index f1e473a..59eb870 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6583,6 +6583,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
break;
}
+ case Intrinsic::vector_partial_reduce_fadd:
case Intrinsic::vector_partial_reduce_add: {
VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index f9fda23..3f0ecbe 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -311,6 +311,10 @@ static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) {
return "v73";
case 75:
return "v75";
+ case 79:
+ return "v79";
+ case 81:
+ return "v81";
default:
return {};
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40e6400..c8a038f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1916,6 +1916,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
}
+
+ // Handle floating-point partial reduction
+ if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
+ setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
+ MVT::nxv8f16, Legal);
+ }
}
// Handle non-aliasing elements mask
@@ -2283,6 +2289,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
}
+ if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
+ setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT,
+ MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
+ }
+
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABDS, VT, Default);
setOperationAction(ISD::ABDU, VT, Default);
@@ -7875,6 +7886,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
+ case ISD::PARTIAL_REDUCE_FMLA:
return LowerPARTIAL_REDUCE_MLA(Op, DAG);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 58a53af..bb2f083 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -13292,18 +13292,24 @@ multiclass AtomicFPStore<bit R, bits<3> op0, string asm> {
def H : BaseAtomicFPStore<FPR16, 0b01, R, op0, asm>;
}
-class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind>
+class BaseSIMDThreeSameVectorFP8MatrixMul<string asm, bits<2> size, string kind, list<dag> pattern>
: BaseSIMDThreeSameVectorTied<1, 1, {size, 0}, 0b11101,
- V128, asm, ".16b", []> {
+ V128, asm, ".16b", pattern> {
let AsmString = !strconcat(asm, "{\t$Rd", kind, ", $Rn.16b, $Rm.16b",
"|", kind, "\t$Rd, $Rn, $Rm}");
}
-multiclass SIMDThreeSameVectorFP8MatrixMul<string asm>{
- def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h">{
+multiclass SIMDThreeSameVectorFP8MatrixMul<string asm, SDPatternOperator OpNode>{
+ def v8f16: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b00, ".8h",
+ [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
let Predicates = [HasNEON, HasF8F16MM];
}
- def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s">{
+ def v4f32: BaseSIMDThreeSameVectorFP8MatrixMul<asm, 0b10, ".4s",
+ [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
let Predicates = [HasNEON, HasF8F32MM];
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e6954f7..76f076a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11417,7 +11417,7 @@ let Predicates = [HasF16F32MM] in
defm FMMLA : SIMDThreeSameVectorFMLAWiden<"fmmla">;
let Uses = [FPMR, FPCR] in
- defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+ defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla", int_aarch64_neon_fmmla>;
//===----------------------------------------------------------------------===//
// Contention Management Hints (FEAT_CMH)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dc..e1f4386 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
node:$Zm)
]>;
+def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm),
+ [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm),
+ (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm)
+ ]>;
+
def SDT_AArch64FCVT : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3>
@@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
let Predicates = [HasSVE2p1_or_SME2] in {
defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>;
-defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
+defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>;
defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
index 65e6ed9..c52eb4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
const UniformityInfo &UI,
ValueMap<const Value *, bool> &Tracker) {
llvm::Intrinsic::ID IID = II.getIntrinsicID();
-
+ /// We deliberately do not simplify readfirstlane with a uniform argument, so
+ /// that frontends can use it to force a copy to SGPR and thereby prevent the
+ /// backend from generating unwanted waterfall loops.
switch (IID) {
case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
Value *Src = II.getArgOperand(0);
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
return Changed;
}
default:
- llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+ return false;
}
return false;
}
@@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
auto *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_permlane64:
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- case Intrinsic::amdgcn_ballot:
- break;
- default:
- continue;
- }
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
}
return IsChanged;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 84984a0..964309b 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -681,6 +681,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
return false;
MI->setDesc(TII->get(NewMFMAOpc));
MI->untieRegOperand(0);
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned I = 0; I < MI->getNumDefs(); ++I)
+ if (MCID.getOperandConstraint(I, MCOI::EARLY_CLOBBER) != -1)
+ MI->getOperand(I).setIsEarlyClobber(true);
}
// TODO: Should we try to avoid adding this to the candidate list?
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea..9c74c65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10163,7 +10163,7 @@ static bool followSubRegDef(MachineInstr &MI,
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
@@ -10628,6 +10628,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
const SIRegisterInfo &RI) {
MachineInstr *KillsSCC = nullptr;
+ if (SCCValid->getParent() != SCCRedefine->getParent())
+ return false;
for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
SCCRedefine->getIterator())) {
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10672,8 +10674,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (CmpValue != 0)
return false;
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
// For S_OP that set SCC = DST!=0, do the transformation
@@ -10692,6 +10694,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;
+ // If the s_or_b32 result sY is unused (i.e. it is effectively a 64-bit
+ // s_cmp_lg of a register pair) and its inputs are the lo and hi halves of a
+ // 64-bit foldableSelect, then delete the s_or_b32 in the sequence:
+ // sX = s_cselect_b64 (non-zero imm), 0
+ // sLo = copy sX.sub0
+ // sHi = copy sX.sub1
+ // sY = s_or_b32 sLo, sHi
+ if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+ MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+ const MachineOperand &OrOpnd1 = Def->getOperand(1);
+ const MachineOperand &OrOpnd2 = Def->getOperand(2);
+ if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+ MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ Def2->getOperand(1).isReg() &&
+ Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+ Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+ MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select))
+ optimizeSCC(Select, Def, RI);
+ }
+ }
+ }
return true;
};
@@ -10721,8 +10749,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
+ if (!Def)
return false;
if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b53..8d693b1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI);
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bfac639..caff354 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1334,20 +1334,21 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
unsigned ConstantBusCount = 0;
for (MachineOperand &Op : MI.explicit_uses()) {
- if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
- continue;
-
- unsigned I = Op.getOperandNo();
+ if (Op.isReg()) {
+ if (TRI->isVGPR(*MRI, Op.getReg()))
+ continue;
- int16_t RegClass = TII->getOpRegClassID(Desc.operands()[I]);
- if (RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(RegClass)))
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0) {
+ ++ConstantBusCount;
+ continue;
+ }
+ } else if (!Op.isImm())
continue;
- if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
- TRI->isSGPRReg(*MRI, Op.getReg())) {
- ++ConstantBusCount;
+ unsigned I = Op.getOperandNo();
+ const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I, TRI);
+ if (!OpRC || !TRI->isVSSuperClass(OpRC))
continue;
- }
Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 20fc849..dd233e2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -657,6 +657,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
+ if (Subtarget.isISA3_0() && isPPC64) {
+ setOperationAction(ISD::VP_STORE, MVT::v16i1, Custom);
+ setOperationAction(ISD::VP_STORE, MVT::v8i1, Custom);
+ setOperationAction(ISD::VP_STORE, MVT::v4i1, Custom);
+ setOperationAction(ISD::VP_STORE, MVT::v2i1, Custom);
+ setOperationAction(ISD::VP_LOAD, MVT::v16i1, Custom);
+ setOperationAction(ISD::VP_LOAD, MVT::v8i1, Custom);
+ setOperationAction(ISD::VP_LOAD, MVT::v4i1, Custom);
+ setOperationAction(ISD::VP_LOAD, MVT::v2i1, Custom);
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
@@ -11917,6 +11928,62 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
return getDataClassTest(LHS, Category, Dl, DAG, Subtarget);
}
+// Adjust the length value for a load/store-with-length to account for the
+// instructions requiring a left-justified length, and for non-byte element
+// types requiring scaling by the element size.
+static SDValue AdjustLength(SDValue Val, unsigned Bits, bool Left,
+ SelectionDAG &DAG) {
+ SDLoc dl(Val);
+ EVT VT = Val->getValueType(0);
+ unsigned LeftAdj = Left ? VT.getSizeInBits() - 8 : 0;
+ unsigned TypeAdj = llvm::countr_zero<uint32_t>(Bits / 8);
+ SDValue SHLAmt = DAG.getConstant(LeftAdj + TypeAdj, dl, VT);
+ return DAG.getNode(ISD::SHL, dl, VT, Val, SHLAmt);
+}
+
+SDValue PPCTargetLowering::LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const {
+ auto VPLD = cast<VPLoadSDNode>(Op);
+ bool Future = Subtarget.isISAFuture();
+ SDLoc dl(Op);
+ assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(3).getNode(), true) &&
+ "Mask predication not supported");
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPLD->getOperand(4));
+ unsigned IID = Future ? Intrinsic::ppc_vsx_lxvrl : Intrinsic::ppc_vsx_lxvl;
+ unsigned EltBits = Op->getValueType(0).getScalarType().getSizeInBits();
+ Len = AdjustLength(Len, EltBits, !Future, DAG);
+ SDValue Ops[] = {VPLD->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+ VPLD->getOperand(1), Len};
+ SDVTList Tys = DAG.getVTList(Op->getValueType(0), MVT::Other);
+ SDValue VPL =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys, Ops,
+ VPLD->getMemoryVT(), VPLD->getMemOperand());
+ return VPL;
+}
+
+SDValue PPCTargetLowering::LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const {
+ auto VPST = cast<VPStoreSDNode>(Op);
+ assert(ISD::isConstantSplatVectorAllOnes(Op->getOperand(4).getNode(), true) &&
+ "Mask predication not supported");
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc dl(Op);
+ SDValue Len = DAG.getNode(ISD::ANY_EXTEND, dl, PtrVT, VPST->getOperand(5));
+ unsigned EltBits =
+ Op->getOperand(1).getValueType().getScalarType().getSizeInBits();
+ bool Future = Subtarget.isISAFuture();
+ unsigned IID = Future ? Intrinsic::ppc_vsx_stxvrl : Intrinsic::ppc_vsx_stxvl;
+ Len = AdjustLength(Len, EltBits, !Future, DAG);
+ SDValue Ops[] = {
+ VPST->getChain(), DAG.getConstant(IID, dl, MVT::i32),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, VPST->getOperand(1)),
+ VPST->getOperand(2), Len};
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue VPS =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops,
+ VPST->getMemoryVT(), VPST->getMemOperand());
+ return VPS;
+}
+
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -12771,6 +12838,10 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Op->getFlags().hasNoFPExcept())
return Op;
return SDValue();
+ case ISD::VP_LOAD:
+ return LowerVP_LOAD(Op, DAG);
+ case ISD::VP_STORE:
+ return LowerVP_STORE(Op, DAG);
}
}
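// Worked model of AdjustLength above (illustrative sketch, assuming a 64-bit
// length value as produced by the ANY_EXTEND to the pointer type). lxvl/stxvl
// expect the byte count left-justified in the most significant byte of the
// length register, so the element count is shifted by (64 - 8) plus
// log2(bytes per element).
#include <bit>
#include <cassert>
#include <cstdint>

static uint64_t adjustLength(uint64_t NumElts, unsigned EltBits, bool LeftJustify) {
  unsigned LeftAdj = LeftJustify ? 64 - 8 : 0;
  unsigned TypeAdj = std::countr_zero(EltBits / 8u); // scale elements to bytes
  return NumElts << (LeftAdj + TypeAdj);
}

int main() {
  // A v8i16 vp.load with EVL = 5 covers 10 bytes; the count lands in the top
  // byte of the length register for lxvl.
  assert(adjustLength(5, 16, /*LeftJustify=*/true) == (10ull << 56));
  return 0;
}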
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 880aca7..d967018 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1345,6 +1345,9 @@ namespace llvm {
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVP_LOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVP_STORE(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDMFVectorLoad(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b04e887..e74f1bd 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -24,6 +24,10 @@ using namespace llvm;
#define DEBUG_TYPE "ppctti"
+static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl",
+ cl::desc("Allow vp.load and vp.store for pwr9"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
@@ -1031,3 +1035,42 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
return TLI->supportsTailCallFor(CB);
}
+
+// Target hook used by CodeGen to decide whether to expand vector predication
+// intrinsics into scalar operations or to use special ISD nodes to represent
+// them. The Target will not see the intrinsics.
+TargetTransformInfo::VPLegalization
+PPCTTIImpl::getVPLegalizationStrategy(const VPIntrinsic &PI) const {
+ using VPLegalization = TargetTransformInfo::VPLegalization;
+ unsigned Directive = ST->getCPUDirective();
+ VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI);
+ if (Directive != PPC::DIR_PWR10 && Directive != PPC::DIR_PWR_FUTURE &&
+ (!Pwr9EVL || Directive != PPC::DIR_PWR9))
+ return DefaultLegalization;
+
+ if (!ST->isPPC64())
+ return DefaultLegalization;
+
+ unsigned IID = PI.getIntrinsicID();
+ if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store)
+ return DefaultLegalization;
+
+ bool IsLoad = IID == Intrinsic::vp_load;
+ Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType();
+ EVT VT = TLI->getValueType(DL, VecTy, true);
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ VT != MVT::v16i8)
+ return DefaultLegalization;
+
+ auto IsAllTrueMask = [](Value *MaskVal) {
+ if (Value *SplattedVal = getSplatValue(MaskVal))
+ if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+ return ConstValue->isAllOnesValue();
+ return false;
+ };
+ unsigned MaskIx = IsLoad ? 1 : 2;
+ if (!IsAllTrueMask(PI.getOperand(MaskIx)))
+ return DefaultLegalization;
+
+ return VPLegalization(VPLegalization::Legal, VPLegalization::Legal);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 8d7f255..f80ebdb 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -150,6 +150,9 @@ public:
ArrayRef<Type *> Types) const override;
bool supportsTailCallFor(const CallBase *CB) const override;
+ TargetTransformInfo::VPLegalization
+ getVPLegalizationStrategy(const VPIntrinsic &PI) const override;
+
private:
// The following constant is used for estimating costs on power9.
static const InstructionCost::CostType P9PipelineFlushEstimate = 80;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 7bc0b5b..332433b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2140,7 +2140,8 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume memory ops cost scale with the number of vector registers
// possible accessed by the instruction. Note that BasicTTI already
// handles the LT.first term for us.
- if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
+ if (ST->hasVInstructions() && LT.second.isVector() &&
+ CostKind != TTI::TCK_CodeSize)
BaseCost *= TLI->getLMULCost(LT.second);
return Cost + BaseCost;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 21a233b2..b7a93e7 100644
--- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -216,6 +216,7 @@ static unsigned getInstSizeInBytes(const MachineInstr &MI,
MI.isDebugOrPseudoInstr() || MI.isPosition() || MI.isKill() ||
MI.isImplicitDef() || MI.getOpcode() == TargetOpcode::MEMBARRIER ||
MI.getOpcode() == TargetOpcode::INIT_UNDEF || MI.isFakeUse() ||
+ MI.getOpcode() == TargetOpcode::RELOC_NONE ||
// These have a size that may be zero:
MI.isInlineAsm() || MI.getOpcode() == SystemZ::STACKMAP ||
MI.getOpcode() == SystemZ::PATCHPOINT ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 090f649..05a854a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45022,11 +45022,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
+ case X86ISD::VZEXT_MOVL:
case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPV:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3: {
SmallVector<int, 8> Mask;
@@ -45052,6 +45057,16 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
}
break;
}
+ case X86ISD::VBROADCAST: {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.isVector()) {
+ APInt DemandedSrc = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+ return DAG.isGuaranteedNotToBeUndefOrPoison(Src, DemandedSrc, PoisonOnly,
+ Depth + 1);
+ }
+ return DAG.isGuaranteedNotToBeUndefOrPoison(Src, PoisonOnly, Depth + 1);
+ }
}
return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
Op, DemandedElts, DAG, PoisonOnly, Depth);
@@ -45096,13 +45111,19 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
// SSE target shuffles.
case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
+ case X86ISD::VZEXT_MOVL:
case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ case X86ISD::SHUFP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VPERMILPV:
case X86ISD::VPERMILPI:
+ case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
+ case X86ISD::VBROADCAST:
return false;
// SSE comparisons handle all icmp/fcmp cases.
// TODO: Add CMPM/MM with test coverage.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b7224a3..666033b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7488,11 +7488,12 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-VPWidenMemoryRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
- VFRange &Range) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
+ VFRange &Range) {
+ assert((VPI->getOpcode() == Instruction::Load ||
+ VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
+ Instruction *I = VPI->getUnderlyingInstr();
auto WillWiden = [&](ElementCount VF) -> bool {
LoopVectorizationCostModel::InstWidening Decision =
@@ -7522,7 +7523,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
- VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+ VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(0)
+ : VPI->getOperand(1);
if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -7536,77 +7538,78 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
CM.foldTailByMasking() || !GEP
? GEPNoWrapFlags::none()
: GEP->getNoWrapFlags().withoutNoUnsignedWrap();
- VectorPtr =
- new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
- /*Stride*/ -1, Flags, I->getDebugLoc());
+ VectorPtr = new VPVectorEndPointerRecipe(
+ Ptr, &Plan.getVF(), getLoadStoreType(I),
+ /*Stride*/ -1, Flags, VPI->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
- I->getDebugLoc());
+ VPI->getDebugLoc());
}
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}
- if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ if (VPI->getOpcode() == Instruction::Load) {
+ auto *Load = cast<LoadInst>(I);
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
+ }
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
- I->getDebugLoc());
+ return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
+ Consecutive, Reverse,
+ VPIRMetadata(*Store, LVer), VPI->getDebugLoc());
}
-/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
-/// insert a recipe to expand the step for the induction recipe.
+/// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will
+/// also insert a recipe to expand the step for the induction recipe.
static VPWidenIntOrFpInductionRecipe *
-createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
- VPValue *Start, const InductionDescriptor &IndDesc,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
- assert(IndDesc.getStartValue() ==
- Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
+createWidenInductionRecipes(VPInstruction *PhiR,
+ const InductionDescriptor &IndDesc, VPlan &Plan,
+ ScalarEvolution &SE, Loop &OrigLoop) {
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
"step must be loop invariant");
+ VPValue *Start = PhiR->getOperand(0);
+ assert(Plan.getLiveIn(IndDesc.getStartValue()) == Start &&
+ "Start VPValue must match IndDesc's start value");
+
VPValue *Step =
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
- if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
- return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, TruncI,
- TruncI->getDebugLoc());
- }
- assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
+ PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr());
return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
- IndDesc, Phi->getDebugLoc());
+ IndDesc, PhiR->getDebugLoc());
}
-VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
- PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
+VPHeaderPHIRecipe *
+VPRecipeBuilder::tryToOptimizeInductionPHI(VPInstruction *VPI, VFRange &Range) {
+ auto *Phi = cast<PHINode>(VPI->getUnderlyingInstr());
// Check if this is an integer or fp induction. If so, build the recipe that
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
- return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
- *PSE.getSE(), *OrigLoop);
+ return createWidenInductionRecipes(VPI, *II, Plan, *PSE.getSE(), *OrigLoop);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep());
return new VPWidenPointerInductionRecipe(
- Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
+ Phi, VPI->getOperand(0), Step, &Plan.getVFxUF(), *II,
LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) {
return CM.isScalarAfterVectorization(Phi, VF);
},
Range),
- Phi->getDebugLoc());
+ VPI->getDebugLoc());
}
return nullptr;
}
-VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
- TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
+VPWidenIntOrFpInductionRecipe *
+VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
+ VFRange &Range) {
+ auto *I = cast<TruncInst>(VPI->getUnderlyingInstr());
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -7621,21 +7624,24 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
};
};
- if (LoopVectorizationPlanner::getDecisionAndClampRange(
- IsOptimizableIVTruncate(I), Range)) {
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(
+ IsOptimizableIVTruncate(I), Range))
+ return nullptr;
- auto *Phi = cast<PHINode>(I->getOperand(0));
- const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
- VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
- return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
- *OrigLoop);
- }
- return nullptr;
+ auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
+ VPI->getOperand(0)->getDefiningRecipe());
+ PHINode *Phi = WidenIV->getPHINode();
+ VPValue *Start = WidenIV->getStartValue();
+ const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
+ VPValue *Step =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
+ return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
+ IndDesc, I, VPI->getDebugLoc());
}
-VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
- ArrayRef<VPValue *> Operands,
+VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
VFRange &Range) {
+ CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -7652,7 +7658,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
+ SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
+ VPI->op_begin() + CI->arg_size());
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
@@ -7664,7 +7671,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
- CI->getDebugLoc());
+ VPI->getDebugLoc());
Function *Variant = nullptr;
std::optional<unsigned> MaskPos;
@@ -7711,13 +7718,13 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Mask = getBlockInMask(Builder.getInsertBlock());
else
Mask = Plan.getOrAddLiveIn(
- ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
+ ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
Ops.insert(Ops.begin() + *MaskPos, Mask);
}
- Ops.push_back(Operands.back());
- return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
+ Ops.push_back(VPI->getOperand(VPI->getNumOperands() - 1));
+ return new VPWidenCallRecipe(CI, Variant, Ops, VPI->getDebugLoc());
}
return nullptr;
@@ -7737,9 +7744,9 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands) {
- switch (I->getOpcode()) {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
+ auto *I = VPI->getUnderlyingInstr();
+ switch (VPI->getOpcode()) {
default:
return nullptr;
case Instruction::SDiv:
@@ -7749,10 +7756,11 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
// If not provably safe, use a select to form a safe divisor before widening the
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
- SmallVector<VPValue *> Ops(Operands);
+ SmallVector<VPValue *> Ops(VPI->operands());
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
VPValue *One = Plan.getConstantInt(I->getType(), 1u);
- auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
+ auto *SafeRHS =
+ Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, Ops);
}
@@ -7777,8 +7785,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
case Instruction::Sub:
case Instruction::Xor:
case Instruction::Freeze: {
- SmallVector<VPValue *> NewOps(Operands);
- if (Instruction::isBinaryOp(I->getOpcode())) {
+ SmallVector<VPValue *> NewOps(VPI->operands());
+ if (Instruction::isBinaryOp(VPI->getOpcode())) {
// The legacy cost model uses SCEV to check if some of the operands are
// constants. To match the legacy cost model's behavior, use SCEV to try
// to replace operands with constants.
@@ -7795,7 +7803,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
return Plan.getOrAddLiveIn(C->getValue());
};
// For Mul, the legacy cost model checks both operands.
- if (I->getOpcode() == Instruction::Mul)
+ if (VPI->getOpcode() == Instruction::Mul)
NewOps[0] = GetConstantViaSCEV(NewOps[0]);
// For other binops, the legacy cost model only checks the second operand.
NewOps[1] = GetConstantViaSCEV(NewOps[1]);
@@ -7803,7 +7811,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
return new VPWidenRecipe(*I, NewOps);
}
case Instruction::ExtractValue: {
- SmallVector<VPValue *> NewOps(Operands);
+ SmallVector<VPValue *> NewOps(VPI->operands());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
@@ -7813,9 +7821,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
};
}
-VPHistogramRecipe *
-VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
- ArrayRef<VPValue *> Operands) {
+VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
+ VPInstruction *VPI) {
// FIXME: Support other operations.
unsigned Opcode = HI->Update->getOpcode();
assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
@@ -7823,7 +7830,7 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
SmallVector<VPValue *, 3> HGramOps;
// Bucket address.
- HGramOps.push_back(Operands[1]);
+ HGramOps.push_back(VPI->getOperand(1));
// Increment value.
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
@@ -7832,12 +7839,12 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
if (Legal->isMaskRequired(HI->Store))
HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
- return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
+ return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
}
-VPReplicateRecipe *
-VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
- VFRange &Range) {
+VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
+ VFRange &Range) {
+ auto *I = VPI->getUnderlyingInstr();
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
@@ -7893,8 +7900,8 @@ VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
(Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
"Should not predicate a uniform recipe");
- auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
- VPIRMetadata(*I, LVer));
+ auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform,
+ BlockInMask, VPIRMetadata(*I, LVer));
return Recipe;
}
@@ -8075,8 +8082,6 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
// First, check for specific widening recipes that deal with inductions, Phi
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
- Instruction *Instr = R->getUnderlyingInstr();
- SmallVector<VPValue *, 4> Operands(R->operands());
if (auto *PhiR = dyn_cast<VPPhi>(R)) {
VPBasicBlock *Parent = PhiR->getParent();
[[maybe_unused]] VPRegionBlock *LoopRegionOf =
@@ -8084,15 +8089,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
"Non-header phis should have been handled during predication");
auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
- assert(Operands.size() == 2 && "Must have 2 operands for header phis");
- if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
+ assert(R->getNumOperands() == 2 && "Must have 2 operands for header phis");
+ if ((Recipe = tryToOptimizeInductionPHI(PhiR, Range)))
return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
assert((Legal->isReductionVariable(Phi) ||
Legal->isFixedOrderRecurrence(Phi)) &&
"can only widen reductions and fixed-order recurrences here");
- VPValue *StartV = Operands[0];
+ VPValue *StartV = R->getOperand(0);
if (Legal->isReductionVariable(Phi)) {
const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(Phi);
assert(RdxDesc.getRecurrenceStartValue() ==
@@ -8112,13 +8117,15 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
}
// Add backedge value.
- PhiRecipe->addOperand(Operands[1]);
+ PhiRecipe->addOperand(R->getOperand(1));
return PhiRecipe;
}
assert(!R->isPhi() && "only VPPhi nodes expected at this point");
- if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
- cast<TruncInst>(Instr), Operands, Range)))
+ auto *VPI = cast<VPInstruction>(R);
+ Instruction *Instr = R->getUnderlyingInstr();
+ if (VPI->getOpcode() == Instruction::Trunc &&
+ (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
return Recipe;
// All widen recipes below deal only with VF > 1.
@@ -8126,46 +8133,46 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return tryToWidenCall(CI, Operands, Range);
+ if (VPI->getOpcode() == Instruction::Call)
+ return tryToWidenCall(VPI, Range);
- if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
- if (auto HistInfo = Legal->getHistogramInfo(SI))
- return tryToWidenHistogram(*HistInfo, Operands);
+ if (VPI->getOpcode() == Instruction::Store)
+ if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
+ return tryToWidenHistogram(*HistInfo, VPI);
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return tryToWidenMemory(Instr, Operands, Range);
+ if (VPI->getOpcode() == Instruction::Load ||
+ VPI->getOpcode() == Instruction::Store)
+ return tryToWidenMemory(VPI, Range);
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+ return tryToCreatePartialReduction(VPI, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Instr))
- return new VPWidenGEPRecipe(GEP, Operands);
+ if (VPI->getOpcode() == Instruction::GetElementPtr)
+ return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), R->operands());
- if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- return new VPWidenSelectRecipe(*SI, Operands);
- }
+ if (VPI->getOpcode() == Instruction::Select)
+ return new VPWidenSelectRecipe(*cast<SelectInst>(Instr), R->operands());
- if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
- *CI);
+ if (Instruction::isCast(VPI->getOpcode())) {
+ auto *CI = cast<CastInst>(Instr);
+ return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
+ CI->getType(), *CI);
}
- return tryToWiden(Instr, Operands);
+ return tryToWiden(VPI);
}
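After this change the dispatch keys off the VPInstruction's own opcode and operand list instead of re-deriving both from the underlying IR instruction and a separately copied operand vector. A stripped-down sketch of that shape, with a hypothetical wrapper type standing in for VPInstruction:

#include <vector>

// Hypothetical stand-ins for the real VPlan types, only to show the dispatch shape.
struct FakeValue {};
struct FakeVPInstruction {
  unsigned Opcode;              // mirrors Instruction::* opcodes
  std::vector<FakeValue *> Ops; // operands live on the recipe itself
  unsigned getOpcode() const { return Opcode; }
  const std::vector<FakeValue *> &operands() const { return Ops; }
};

enum : unsigned { OpCall = 1, OpLoad = 2, OpStore = 3, OpGEP = 4 };

// Classification driven by the recipe's opcode, no dyn_cast on underlying IR.
const char *classifyForWidening(const FakeVPInstruction &VPI) {
  switch (VPI.getOpcode()) {
  case OpCall:  return "widen-call";
  case OpLoad:
  case OpStore: return "widen-memory";
  case OpGEP:   return "widen-gep";
  default:      return "widen-generic";
  }
}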
VPRecipeBase *
-VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands,
+VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor) {
- assert(Operands.size() == 2 &&
+ assert(Reduction->getNumOperands() == 2 &&
"Unexpected number of operands for partial reduction");
- VPValue *BinOp = Operands[0];
- VPValue *Accumulator = Operands[1];
+ VPValue *BinOp = Reduction->getOperand(0);
+ VPValue *Accumulator = Reduction->getOperand(1);
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
isa<VPPartialReductionRecipe>(BinOpRecipe))
@@ -8176,28 +8183,29 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"all accumulators in chain must have same scale factor");
unsigned ReductionOpcode = Reduction->getOpcode();
+ auto *ReductionI = Reduction->getUnderlyingInstr();
if (ReductionOpcode == Instruction::Sub) {
- auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
+ auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
SmallVector<VPValue *, 2> Ops;
Ops.push_back(Plan.getOrAddLiveIn(Zero));
Ops.push_back(BinOp);
- BinOp = new VPWidenRecipe(*Reduction, Ops);
+ BinOp = new VPWidenRecipe(*ReductionI, Ops);
Builder.insert(BinOp->getDefiningRecipe());
ReductionOpcode = Instruction::Add;
}
VPValue *Cond = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
+ if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) {
assert((ReductionOpcode == Instruction::Add ||
ReductionOpcode == Instruction::Sub) &&
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
+ VPValue *Zero = Plan.getConstantInt(ReductionI->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
- ScaleFactor, Reduction);
+ ScaleFactor, ReductionI);
}
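The Sub case is folded into an Add of a negated operand, and under predication each lane's contribution is selected down to zero (the additive neutral element) before it reaches the accumulator. A scalar sketch of that arithmetic, with an invented four-lane loop purely for illustration:

#include <array>
#include <cstdint>

// Scalar model: acc - x becomes acc + (0 - x), and masked-off lanes add 0.
int64_t partialReduceSub(int64_t Acc, const std::array<int64_t, 4> &Vals,
                         const std::array<bool, 4> &Mask) {
  for (int Lane = 0; Lane < 4; ++Lane) {
    int64_t Contribution = 0 - Vals[Lane];            // the widened "0 - x"
    int64_t Selected = Mask[Lane] ? Contribution : 0; // predicated select to zero
    Acc += Selected;                                   // reduction opcode is now Add
  }
  return Acc;
}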
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8382,7 +8390,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
if (!Recipe)
- Recipe = RecipeBuilder.handleReplication(Instr, R.operands(), Range);
+ Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(SingleDef),
+ Range);
RecipeBuilder.setRecipe(Instr, Recipe);
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 41878e3..a7000af 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -93,42 +93,37 @@ class VPRecipeBuilder {
/// Range. The function should not be called for memory instructions or calls.
bool shouldWiden(Instruction *I, VFRange &Range) const;
- /// Check if the load or store instruction \p I should widened for \p
+ /// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
- /// Check if an induction recipe should be constructed for \p Phi. If so build
+ /// Check if an induction recipe should be constructed for \p VPI. If so build
/// and return it. If not, return null.
- VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
- ArrayRef<VPValue *> Operands,
+ VPHeaderPHIRecipe *tryToOptimizeInductionPHI(VPInstruction *VPI,
VFRange &Range);
- /// Optimize the special case where the operand of \p I is a constant integer
- /// induction variable.
+ /// Optimize the special case where the operand of \p VPI is a constant
+ /// integer induction variable.
VPWidenIntOrFpInductionRecipe *
- tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ tryToOptimizeInductionTruncate(VPInstruction *VPI, VFRange &Range);
- /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+ /// Handle call instructions. If \p VPI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
- VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range);
- /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
- /// if it can. The function should only be called if the cost-model indicates
- /// that widening should be performed.
- VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
+ /// Check if \p VPI has an opcode that can be widened and return a
+ /// VPWidenRecipe if it can. The function should only be called if the
+ /// cost-model indicates that widening should be performed.
+ VPWidenRecipe *tryToWiden(VPInstruction *VPI);
/// Makes Histogram count operations safe for vectorization, by emitting a
/// llvm.experimental.vector.histogram.add intrinsic in place of the
/// Load + Add|Sub + Store operations that perform the histogram in the
/// original scalar loop.
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
- ArrayRef<VPValue *> Operands);
+ VPInstruction *VPI);
/// Examines reduction operations to see if the target can use a cheaper
/// operation with a wider per-iteration input VF and narrower PHI VF.
@@ -171,8 +166,7 @@ public:
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
- VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands,
+ VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor);
/// Set the recipe created for given ingredient.
@@ -197,12 +191,10 @@ public:
return Ingredient2Recipe[I];
}
- /// Build a VPReplicationRecipe for \p I using \p Operands. If it is
- /// predicated, add the mask as last operand. Range.End may be decreased to
- /// ensure same recipe behavior from \p Range.Start to \p Range.End.
- VPReplicateRecipe *handleReplication(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ /// Build a VPReplicateRecipe for \p VPI. If it is predicated, add the mask
+ /// as last operand. Range.End may be decreased to ensure same recipe behavior
+ /// from \p Range.Start to \p Range.End.
+ VPReplicateRecipe *handleReplication(VPInstruction *VPI, VFRange &Range);
VPValue *getVPValueOrAddLiveIn(Value *V) {
if (auto *I = dyn_cast<Instruction>(V)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 48bd697..634df51 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1288,8 +1288,9 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
// Look through broadcast of single-scalar when used as select conditions; in
// that case the scalar condition can be used directly.
if (match(Def,
- m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue())) &&
- vputils::isSingleScalar(C)) {
+ m_Select(m_Broadcast(m_VPValue(C)), m_VPValue(), m_VPValue()))) {
+ assert(vputils::isSingleScalar(C) &&
+ "broadcast operand must be single-scalar");
Def->setOperand(0, C);
return;
}
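The simplification relies on the select condition being uniform: broadcasting a single scalar and selecting per lane picks the same source in every lane, so the scalar condition can drive the select directly. A tiny standalone illustration of that equivalence, with plain arrays standing in for vectors:

#include <array>
#include <cassert>

// With a uniform condition, the lanewise select over a splat of C is the same
// as choosing one whole source vector with the scalar C.
std::array<int, 4> selectSplat(bool C, std::array<int, 4> A,
                               std::array<int, 4> B) {
  std::array<int, 4> Splat{}, Out{};
  Splat.fill(C); // broadcast of the single scalar
  for (int Lane = 0; Lane < 4; ++Lane)
    Out[Lane] = Splat[Lane] ? A[Lane] : B[Lane]; // lanewise select
  assert(Out == (C ? A : B)); // equals the scalar-driven select
  return Out;
}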
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 27a8bbd..ed3a0a0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -696,11 +696,11 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
// Match an insert (op (extract)) pattern.
- Value *DestVec;
- uint64_t Index;
+ Value *DstVec;
+ uint64_t ExtIdx, InsIdx;
Instruction *FNeg;
- if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
- m_ConstantInt(Index))))
+ if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
+ m_ConstantInt(InsIdx))))
return false;
// Note: This handles the canonical fneg instruction and "fsub -0.0, X".
@@ -708,67 +708,74 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
Instruction *Extract;
if (!match(FNeg, m_FNeg(m_CombineAnd(
m_Instruction(Extract),
- m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
+ m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
return false;
- auto *VecTy = cast<FixedVectorType>(I.getType());
- auto *ScalarTy = VecTy->getScalarType();
+ auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
+ auto *DstVecScalarTy = DstVecTy->getScalarType();
auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
- if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
+ if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
return false;
- // Ignore bogus insert/extract index.
- unsigned NumElts = VecTy->getNumElements();
- if (Index >= NumElts)
+ // Ignore if the insert/extract index is out of bounds or the destination
+ // vector has only one element.
+ unsigned NumDstElts = DstVecTy->getNumElements();
+ unsigned NumSrcElts = SrcVecTy->getNumElements();
+ if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
return false;
// We are inserting the negated element into the same lane that we extracted
// from. This is equivalent to a select-shuffle that chooses all but the
// negated element from the destination vector.
- SmallVector<int> Mask(NumElts);
+ SmallVector<int> Mask(NumDstElts);
std::iota(Mask.begin(), Mask.end(), 0);
- Mask[Index] = Index + NumElts;
+ Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
- TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
+ TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
+ TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
// If the extract has one use, it will be eliminated, so count it in the
// original cost. If it has more than one use, ignore the cost because it will
// be the same before/after.
if (Extract->hasOneUse())
- OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
+ OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
- Mask, CostKind);
+ TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
+ DstVecTy, Mask, CostKind);
- bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
+ bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
// If the lengths of the two vectors are not equal,
// we need to add a length-change vector. Add this cost.
SmallVector<int> SrcMask;
if (NeedLenChg) {
- SrcMask.assign(NumElts, PoisonMaskElem);
- SrcMask[Index] = Index;
+ SrcMask.assign(NumDstElts, PoisonMaskElem);
+ SrcMask[ExtIdx % NumDstElts] = ExtIdx;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
- VecTy, SrcVecTy, SrcMask, CostKind);
+ DstVecTy, SrcVecTy, SrcMask, CostKind);
}
+ LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
if (NewCost > OldCost)
return false;
- Value *NewShuf;
- // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
+ Value *NewShuf, *LenChgShuf = nullptr;
+ // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
if (NeedLenChg) {
- // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
- Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
- NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
+ // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
+ LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
+ NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
+ Worklist.pushValue(LenChgShuf);
} else {
- // shuffle DestVec, (fneg SrcVec), Mask
- NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
+ // shuffle DstVec, (fneg SrcVec), Mask
+ NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
}
+ Worklist.pushValue(VecFNeg);
replaceValue(I, *NewShuf);
return true;
}
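The select-shuffle mask keeps every destination lane except the one being overwritten, which is redirected into the (possibly length-changed) negated source. A small standalone sketch of how the two masks are built, with made-up element counts and indices just to make the result concrete:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical shapes: 4-wide destination, 8-wide source, insert into lane 1,
  // element extracted from source lane 5.
  unsigned NumDstElts = 4, InsIdx = 1, ExtIdx = 5;

  // Select-shuffle over {DstVec, length-changed fneg(SrcVec)}: identity except
  // the inserted lane, which reads from the second input.
  std::vector<int> Mask(NumDstElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;

  // Length-change mask: bring the extracted source lane into a DstVec-sized
  // vector; all other lanes stay poison (-1 here).
  std::vector<int> SrcMask(NumDstElts, -1);
  SrcMask[ExtIdx % NumDstElts] = ExtIdx;

  for (int M : Mask) std::printf("%d ", M);    // 0 5 2 3
  std::printf("\n");
  for (int M : SrcMask) std::printf("%d ", M); // -1 5 -1 -1
  std::printf("\n");
}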