diff options
author | Anna Welker <anna.welker@arm.com> | 2020-03-11 10:13:11 +0000 |
---|---|---|
committer | Anna Welker <anna.welker@arm.com> | 2020-03-11 10:23:41 +0000 |
commit | a6d3bec83fca0568e1fb02b9297b43435b9579d6 (patch) | |
tree | 3e8984b5f7385372b60afae000ea523dc057564a | |
parent | 8a12553223180246eeafaa0fa7bfa11e834d34b6 (diff) | |
download | llvm-a6d3bec83fca0568e1fb02b9297b43435b9579d6.zip llvm-a6d3bec83fca0568e1fb02b9297b43435b9579d6.tar.gz llvm-a6d3bec83fca0568e1fb02b9297b43435b9579d6.tar.bz2 |
[TTI][ARM][MVE] Refine gather/scatter cost model
Refines the gather/scatter cost model, but also changes the TTI
function getIntrinsicInstrCost to accept an additional parameter
which is needed for the gather/scatter cost evaluation.
This did require trivial changes in some non-ARM backends to
adopt the new parameter.
Extending gathers and truncating scatters are now priced cheaper.
Differential Revision: https://reviews.llvm.org/D75525
18 files changed, 363 insertions, 110 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 2968a5f..1387b90 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -966,8 +966,11 @@ public: /// \p VariableMask - true when the memory access is predicated with a mask /// that is not a compile-time constant /// \p Alignment - alignment of single element + /// \p I - the optional original context instruction, if one exists, e.g. the + /// load/store to transform or the call to the gather/scatter intrinsic int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, unsigned Alignment) const; + bool VariableMask, unsigned Alignment, + const Instruction *I = nullptr) const; /// \return The cost of the interleaved memory operation. /// \p Opcode is the memory operation code @@ -1006,16 +1009,22 @@ public: /// \returns The cost of Intrinsic instructions. Analyses the real arguments. /// Three cases are handled: 1. scalar instruction 2. vector instruction /// 3. scalar instruction which is to be vectorized with VF. + /// I is the optional original context instruction holding the call to the + /// intrinsic int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1) const; + unsigned VF = 1, + const Instruction *I = nullptr) const; /// \returns The cost of Intrinsic instructions. Types analysis only. /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the /// arguments and the return value will be computed based on types. - int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX) const; + /// I is the optional original context instruction holding the call to the + /// intrinsic + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr) const; /// \returns The cost of Call instructions. int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const; @@ -1340,9 +1349,9 @@ public: virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; - virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - Value *Ptr, bool VariableMask, - unsigned Alignment) = 0; + virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment, + const Instruction *I = nullptr) = 0; virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, @@ -1355,10 +1364,12 @@ public: virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) = 0; + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed, + const Instruction *I) = 0; virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0; + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I) = 0; virtual int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; @@ -1759,11 +1770,11 @@ public: unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } - int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - Value *Ptr, bool VariableMask, - unsigned Alignment) override { + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment, + const Instruction *I = nullptr) override { return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment); + Alignment, I); } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, @@ -1781,15 +1792,18 @@ public: bool IsPairwiseForm, bool IsUnsigned) override { return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned); } - int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, - FastMathFlags FMF, unsigned ScalarizationCostPassed) override { - return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); - } - int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override { - return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); - } + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed, + const Instruction *I) override { + return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF, + ScalarizationCostPassed, I); + } + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I) override { + return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); + } int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) override { return Impl.getCallInstrCost(F, RetTy, Tys); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 5c51d30..6006573 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -481,8 +481,8 @@ public: } unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, - unsigned Alignment) { + bool VariableMask, unsigned Alignment, + const Instruction *I = nullptr) { return 1; } @@ -497,11 +497,13 @@ public: unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + unsigned ScalarizationCostPassed, + const Instruction *I) { return 1; } unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I) { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index d3f8896..f3dd322 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1072,7 +1072,8 @@ public: /// Get intrinsic cost based on arguments. unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1) { + unsigned VF = 1, + const Instruction *I = nullptr) { unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1); assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type"); auto *ConcreteTTI = static_cast<T *>(this); @@ -1109,16 +1110,17 @@ public: Value *Mask = Args[3]; bool VarMask = !isa<Constant>(Mask); unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue(); - return ConcreteTTI->getGatherScatterOpCost( - Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment); + return ConcreteTTI->getGatherScatterOpCost(Instruction::Store, + Args[0]->getType(), Args[1], + VarMask, Alignment, I); } case Intrinsic::masked_gather: { assert(VF == 1 && "Can't vectorize types here."); Value *Mask = Args[2]; bool VarMask = !isa<Constant>(Mask); unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue(); - return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy, - Args[0], VarMask, Alignment); + return ConcreteTTI->getGatherScatterOpCost( + Instruction::Load, RetTy, Args[0], VarMask, Alignment, I); } case Intrinsic::experimental_vector_reduce_add: case Intrinsic::experimental_vector_reduce_mul: @@ -1180,7 +1182,8 @@ public: /// based on types. unsigned getIntrinsicInstrCost( Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) { + unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max(), + const Instruction *I = nullptr) { auto *ConcreteTTI = static_cast<T *>(this); SmallVector<unsigned, 2> ISDs; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 865f897..a1b30fc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -674,9 +674,10 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, - unsigned Alignment) const { + unsigned Alignment, + const Instruction *I) const { int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment); + Alignment, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -694,17 +695,21 @@ int TargetTransformInfo::getInterleavedMemoryOpCost( } int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) const { + ArrayRef<Type *> Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed, + const Instruction *I) const { int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); + ScalarizationCostPassed, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const { - int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + ArrayRef<Value *> Args, + FastMathFlags FMF, unsigned VF, + const Instruction *I) const { + int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } @@ -1339,8 +1344,8 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const { if (auto *FPMO = dyn_cast<FPMathOperator>(II)) FMF = FPMO->getFastMathFlags(); - return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), - Args, FMF); + return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, + FMF, 1, II); } return -1; default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index b7635f0..caa3a4aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -478,14 +478,14 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, template <typename T> int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<T *> Args, - FastMathFlags FMF, unsigned VF) { + ArrayRef<T *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I) { if (ID != Intrinsic::fma) - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); EVT OrigTy = TLI->getValueType(DL, RetTy); if (!OrigTy.isSimple()) { - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); } // Legalize the type. @@ -507,16 +507,17 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, } int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, - unsigned VF) { - return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF); + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I) { + return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF, I); } int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + unsigned ScalarizationCostPassed, + const Instruction *I) { return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); + ScalarizationCostPassed, I); } unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { @@ -889,7 +890,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U, if (auto *FPMO = dyn_cast<FPMathOperator>(II)) FMF = FPMO->getFastMathFlags(); return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args, - FMF); + FMF, 1, II); } else { return BaseT::getUserCost(U, Operands); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee2e7d..d2a25bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -219,15 +219,16 @@ public: Type *Ty, bool IsPairwise); template <typename T> - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<T *> Args, FastMathFlags FMF, - unsigned VF); + int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<T *> Args, + FastMathFlags FMF, unsigned VF, + const Instruction *I = nullptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1); + unsigned VF = 1, const Instruction *I = nullptr); int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, bool IsUnsigned); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index ffac332..3a88575 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -863,16 +863,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost( unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, - unsigned Alignment) { + unsigned Alignment, + const Instruction *I) { + using namespace PatternMatch; if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment); + Alignment, I); assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!"); VectorType *VTy = cast<VectorType>(DataTy); // TODO: Splitting, once we do that. - // TODO: trunc/sext/zext the result/input unsigned NumElems = VTy->getNumElements(); unsigned EltSize = VTy->getScalarSizeInBits(); @@ -889,19 +890,54 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, unsigned ScalarCost = NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {}); - // TODO: Cost extended gathers or trunc stores correctly. - if (EltSize * NumElems != 128 || NumElems < 4) - return ScalarCost; if (Alignment < EltSize / 8) return ScalarCost; + unsigned ExtSize = EltSize; + // Check whether there's a single user that asks for an extended type + if (I != nullptr) { + // Dependent of the caller of this function, a gather instruction will + // either have opcode Instruction::Load or be a call to the masked_gather + // intrinsic + if ((I->getOpcode() == Instruction::Load || + match(I, m_Intrinsic<Intrinsic::masked_gather>())) && + I->hasOneUse()) { + const User *Us = *I->users().begin(); + if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) { + // only allow valid type combinations + unsigned TypeSize = + cast<Instruction>(Us)->getType()->getScalarSizeInBits(); + if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) || + (TypeSize == 16 && EltSize == 8)) && + TypeSize * NumElems == 128) { + ExtSize = TypeSize; + } + } + } + // Check whether the input data needs to be truncated + TruncInst *T; + if ((I->getOpcode() == Instruction::Store || + match(I, m_Intrinsic<Intrinsic::masked_scatter>())) && + (T = dyn_cast<TruncInst>(I->getOperand(0)))) { + // Only allow valid type combinations + unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits(); + if (((EltSize == 16 && TypeSize == 32) || + (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) && + TypeSize * NumElems == 128) + ExtSize = TypeSize; + } + } + + if (ExtSize * NumElems != 128 || NumElems < 4) + return ScalarCost; + // Any (aligned) i32 gather will not need to be scalarised. - if (EltSize == 32) + if (ExtSize == 32) return VectorCost; // For smaller types, we need to ensure that the gep's inputs are correctly - // extended from a small enough value. Other size (including i64) are + // extended from a small enough value. Other sizes (including i64) are // scalarized for now. - if (EltSize != 8 && EltSize != 16) + if (ExtSize != 8 && ExtSize != 16) return ScalarCost; if (auto BC = dyn_cast<BitCastInst>(Ptr)) @@ -911,12 +947,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, return ScalarCost; unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); // Scale needs to be correct (which is only relevant for i16s). - if (Scale != 1 && Scale * 8 != EltSize) + if (Scale != 1 && Scale * 8 != ExtSize) return ScalarCost; // And we need to zext (not sext) the indexes from a small enough type. - if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) - if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize) + if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { + if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) return VectorCost; + } return ScalarCost; } return ScalarCost; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index cab1514..f21b348 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -222,7 +222,8 @@ public: bool UseMaskForGaps = false); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, unsigned Alignment); + bool VariableMask, unsigned Alignment, + const Instruction *I = nullptr); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 7ec8070..7f6058a 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -131,19 +131,23 @@ unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, } unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) { - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + ArrayRef<Value *> Args, + FastMathFlags FMF, unsigned VF, + const Instruction *I) { + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); } unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type*> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + ArrayRef<Type *> Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed, + const Instruction *I) { if (ID == Intrinsic::bswap) { std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy); return LT.first + 2; } return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); + ScalarizationCostPassed, I); } unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp, @@ -209,9 +213,11 @@ unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, } unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - Value *Ptr, bool VariableMask, unsigned Alignment) { + Value *Ptr, bool VariableMask, + unsigned Alignment, + const Instruction *I) { return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment); + Alignment, I); } unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index ace0d79..409e1df 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -106,10 +106,12 @@ public: unsigned VF); unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys); unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF); + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I); unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type*> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr); unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE, const SCEV *S); unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, @@ -120,7 +122,8 @@ public: unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, unsigned Alignment); + bool VariableMask, unsigned Alignment, + const Instruction *I); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2022728..1bb78ed 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -936,17 +936,21 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) { - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + ArrayRef<Value *> Args, + FastMathFlags FMF, unsigned VF, + const Instruction *I) { + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); } unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type*> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + ArrayRef<Type *> Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed, + const Instruction *I) { if (ID == Intrinsic::bswap && ST->hasP9Vector()) return TLI->getTypeLegalizationCost(DL, RetTy).first; return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, - ScalarizationCostPassed); + ScalarizationCostPassed, I); } bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index b89e585..5aea524 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -111,10 +111,12 @@ public: bool UseMaskForCond = false, bool UseMaskForGaps = false); unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF); + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF, const Instruction *I = nullptr); unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type*> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr); /// @} }; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index df5286b..d088682 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -1124,20 +1124,22 @@ static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) { int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Value *> Args, - FastMathFlags FMF, unsigned VF) { + FastMathFlags FMF, unsigned VF, + const Instruction *I) { int Cost = getVectorIntrinsicInstrCost(ID, RetTy); if (Cost != -1) return Cost; - return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); } int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + unsigned ScalarizationCostPassed, + const Instruction *I) { int Cost = getVectorIntrinsicInstrCost(ID, RetTy); if (Cost != -1) return Cost; - return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, - FMF, ScalarizationCostPassed); + return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, + ScalarizationCostPassed, I); } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bc4d066..5905057 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -101,10 +101,11 @@ public: int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1); - int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); + unsigned VF = 1, const Instruction *I = nullptr); + int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys, + FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr); /// @} }; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 207e1d9..4ac610a 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1887,7 +1887,8 @@ unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { + unsigned ScalarizationCostPassed, + const Instruction *I) { // Costs should match the codegen from: // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll @@ -2309,12 +2310,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, return LT.first * Entry->Cost; } - return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); + return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, + ScalarizationCostPassed, I); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF) { + unsigned VF, const Instruction *I) { static const CostTblEntry AVX512CostTbl[] = { { ISD::ROTL, MVT::v8i64, 1 }, { ISD::ROTL, MVT::v4i64, 1 }, @@ -2404,7 +2406,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, return LT.first * Entry->Cost; } - return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF, I); } int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -3354,7 +3356,8 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, /// Calculate the cost of Gather / Scatter operation int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, bool VariableMask, - unsigned Alignment) { + unsigned Alignment, + const Instruction *I = nullptr) { assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); unsigned VF = SrcVTy->getVectorNumElements(); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index b9c2dbd..4b48064 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -138,7 +138,8 @@ public: int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, unsigned Alignment); + bool VariableMask, unsigned Alignment, + const Instruction *I); int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); @@ -146,10 +147,11 @@ public: int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); + unsigned ScalarizationCostPassed = UINT_MAX, + const Instruction *I = nullptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1); + unsigned VF = 1, const Instruction *I = nullptr); int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 49e38c3..c6bf118 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3301,7 +3301,7 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, FMF = FPMO->getFastMathFlags(); SmallVector<Value *, 4> Operands(CI->arg_operands()); - return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); + return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); } static Type *smallestIntegerVectorType(Type *T1, Type *T2) { @@ -5889,7 +5889,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), - Alignment ? Alignment->value() : 0); + Alignment ? Alignment->value() : 0, I); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll index 0f28d43..53480f5 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll @@ -134,21 +134,26 @@ define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x i32*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) @@ -193,21 +198,26 @@ define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, float* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, float* %base, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, float* %base, <4 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) @@ -252,14 +262,28 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res1, <4 x i16*> %gep1, i32 2, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res2, <4 x i16*> %gep2, i32 2, <4 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <4 x i16> %res5 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <4 x i16> %res6 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) +; ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32 @@ -275,43 +299,109 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask) + + ; result zext + %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16 + %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) + %res5zext = zext <4 x i16> %res5 to <4 x i32> + %res5trunc = trunc <4 x i32> %res5zext to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) + + ; result sext + %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) + %res6sext = sext <4 x i16> %res6 to <4 x i32> + %res6trunc = trunc <4 x i32> %res6sext to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask) + ret void } -define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) { +define void @gep_v4i8(i8* %base, <4 x i8> %ind8, <4 x i1> %mask) { +; CHECK-LABEL: 'gep_v4i8' + +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5zext = zext <4 x i8> %res5 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6sext = sext <4 x i8> %res6 to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) +; +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ; result zext + %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8 + %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) + %res5zext = zext <4 x i8> %res5 to <4 x i32> + %res5trunc = trunc <4 x i32> %res5zext to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) + + ; result sext + %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) + %res6sext = sext <4 x i8> %res6 to <4 x i32> + %res6trunc = trunc <4 x i32> %res6sext to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask) + + ret void +} + +define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask) { ; CHECK-LABEL: 'gep_v8i16' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ressext = sext <8 x i16> %res to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %restrunc = trunc <8 x i32> %ressext to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + ; no offset ext %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32 %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef) call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask) + ; offset zext %indzext = zext <8 x i16> %ind16 to <8 x i32> %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask) + ; offset sext %indsext = sext <8 x i16> %ind16 to <8 x i32> %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) @@ -332,6 +422,19 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, < %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*> %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask) + + ; trunc scatter + %indzext4 = zext <8 x i16> %ind16 to <8 x i32> + %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4 + %indtrunc = trunc <8 x i32> %ind32 to <8 x i16> + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask) + + ; ext result to <8 x i32> + %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef) + %ressext = sext <8 x i16> %res to <8 x i32> + %restrunc = trunc <8 x i32> %ressext to <8 x i16> + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask) + ret void } @@ -340,35 +443,44 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, half* %base, <8 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + + ; no offset ext %gep1 = getelementptr half, half* %base, <8 x i32> %ind32 %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef) call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask) + ; offset zext %indzext = zext <8 x i16> %ind16 to <8 x i32> %gep2 = getelementptr half, half* %base, <8 x i32> %indzext %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask) + ; offset sext %indsext = sext <8 x i16> %ind16 to <8 x i32> %gep3 = getelementptr half, half* %base, <8 x i32> %indsext %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) @@ -389,6 +501,42 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*> %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask) + + ret void +} + +define void @gep_v8i8(i8* %base, <8 x i8> %ind8, <8 x i1> %mask) { +; CHECK-LABEL: 'gep_v8i8' + +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %indzext = zext <8 x i8> %ind8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <8 x i8> %res5 to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <8 x i8> %res6 to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + + ; result zext + %indzext = zext <8 x i8> %ind8 to <8 x i32> + %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext + %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) + %res5zext = zext <8 x i8> %res5 to <8 x i16> + %res5trunc = trunc <8 x i16> %res5zext to <8 x i8> + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) + + ; result sext + %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) + %res6sext = sext <8 x i8> %res6 to <8 x i16> + %res6trunc = trunc <8 x i16> %res6sext to <8 x i8> + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask) + ret void } @@ -397,29 +545,40 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask) + ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*> ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask) + +; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + ; no offset ext %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32 %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef) call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask) + ; offset zext %indzext = zext <16 x i8> %ind8 to <16 x i32> %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask) + ; offset sext %indsext = sext <16 x i8> %ind8 to <16 x i32> %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) @@ -430,6 +589,13 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*> %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef) call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask) + + ; trunc scatter + %indzext4 = zext <16 x i8> %ind8 to <16 x i32> + %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext + %indtrunc = trunc <16 x i32> %ind32 to <16 x i8> + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask) + ret void } |