diff options
-rw-r--r-- | llvm/include/llvm/Analysis/TargetTransformInfo.h | 22 | ||||
-rw-r--r-- | llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 7 | ||||
-rw-r--r-- | llvm/lib/Analysis/TargetTransformInfo.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 9 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.h | 5 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 45 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/supernode.ll | 22 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll | 20 |
8 files changed, 109 insertions, 30 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 735be36..048912b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1243,6 +1243,18 @@ public: ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr) const; + /// Returns the cost estimation for alternating opcode pattern that can be + /// lowered to a single instruction on the target. In X86 this is for the + /// addsub instruction which corresponds to a Shuffle + Fadd + FSub pattern in + /// IR. This function expects two opcodes: \p Opcode0 and \p Opcode1 being + /// selected by \p OpcodeMask. The mask contains one bit per lane and is a `0` + /// when \p Opcode0 is selected and `1` when \p Opcode1 is selected. + /// \p VecTy is the vector type of the instruction to be generated. + InstructionCost getAltInstrCost( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; + /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The exact mask may be passed as Mask, or else the array will be empty. 
/// The index and subtype parameters are used by the subvector insertion and @@ -1944,6 +1956,10 @@ public: unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, OperandValueInfo Opd1Info, OperandValueInfo Opd2Info, ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0; + virtual InstructionCost getAltInstrCost( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0; virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, @@ -2555,6 +2571,12 @@ public: return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, Args, CxtI); } + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const override { + return Impl.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + } InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d8f523..7ad3ce5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -554,6 +554,13 @@ public: return 1; } + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const { + return InstructionCost::getInvalid(); + } + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3f76dfd..67246af 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -862,6 +862,15 @@ InstructionCost 
TargetTransformInfo::getArithmeticInstrCost( return Cost; } +InstructionCost TargetTransformInfo::getAltInstrCost( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const { + InstructionCost Cost = + TTIImpl->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getShuffleCost( ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 8a04987..e09dc7f 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1459,6 +1459,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( Args, CxtI); } +InstructionCost +X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const { + if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) + return TTI::TCC_Basic; + return InstructionCost::getInvalid(); +} + InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 0fa0d24..07a3fff4 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -140,6 +140,11 @@ public: TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr); + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const; + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, 
VectorType *Tp, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 32913b3..944d12f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8384,6 +8384,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, (void)E; return TTI->getInstructionCost(VI, CostKind); }; + // FIXME: Workaround for syntax error reported by MSVC buildbots. + TargetTransformInfo &TTIRef = *TTI; // Need to clear CommonCost since the final shuffle cost is included into // vector cost. auto GetVectorCost = [&](InstructionCost) { @@ -8398,14 +8400,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, // No need to add new vector costs here since we're going to reuse // same main/alternate vector ops, just do different shuffling. } else if (Instruction::isBinaryOp(E->getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); + VecCost = + TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); VecCost += - TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); + TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) { auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, - CI0->getPredicate(), CostKind, VL0); - VecCost += TTI->getCmpSelInstrCost( + VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, + CI0->getPredicate(), CostKind, VL0); + VecCost += TTIRef.getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind, E->getAltOp()); @@ -8414,10 +8417,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); auto *Src0Ty = 
FixedVectorType::get(Src0SclTy, VL.size()); auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, - TTI::CastContextHint::None, CostKind); - VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, - TTI::CastContextHint::None, CostKind); + VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, + TTI::CastContextHint::None, CostKind); + VecCost += + TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, + TTI::CastContextHint::None, CostKind); } SmallVector<int> Mask; E->buildAltOpShuffleMask( @@ -8426,8 +8430,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, return I->getOpcode() == E->getAltOpcode(); }, Mask); - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - FinalVecTy, Mask); + VecCost += TTIRef.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + FinalVecTy, Mask); + // Patterns like [fadd,fsub] can be combined into a single instruction + // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we + // need to take into account their order when looking for the most used + // order. + unsigned Opcode0 = E->getOpcode(); + unsigned Opcode1 = E->getAltOpcode(); + // The opcode mask selects between the two opcodes. + SmallBitVector OpcodeMask(E->Scalars.size(), false); + for (unsigned Lane : seq<unsigned>(0, E->Scalars.size())) + if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1) + OpcodeMask.set(Lane); + // If this pattern is supported by the target then we consider the + // order. + if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + InstructionCost AltVecCost = TTIRef.getAltInstrCost( + VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + return AltVecCost < VecCost ? AltVecCost : VecCost; + } + // TODO: Check the reverse order too. 
return VecCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll index d4c7128..87063fc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -103,21 +103,23 @@ define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, pt ; ENABLED-LABEL: @test_supernode_addsub_alt( ; ENABLED-NEXT: entry: ; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, ptr [[SARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 ; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 ; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] -; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]] -; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]] -; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]] -; ENABLED-NEXT: store double [[SUB0]], ptr [[SARRAY]], align 8 -; ENABLED-NEXT: store double [[ADD1]], ptr [[IDXS1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 +; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] +; 
ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3> +; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3> +; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index aa3c2be..17f9f37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -12,22 +12,24 @@ define void @foo() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]] -; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double poison, double poison, double undef, double undef>, double [[SUB1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 
x double> [[TMP5]], double [[ADD1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: |