diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-22 13:58:44 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-22 13:58:44 +0000 |
commit | ea0d4f9962fbc1741a730ec74b655940ea15424b (patch) | |
tree | 47e47fd7351dc3615c9c33add8a694638cec173d /llvm/lib/IR/AutoUpgrade.cpp | |
parent | 22c9e931470fea2e25bef1f52128e54ec96da403 (diff) | |
download | llvm-ea0d4f9962fbc1741a730ec74b655940ea15424b.zip llvm-ea0d4f9962fbc1741a730ec74b655940ea15424b.tar.gz llvm-ea0d4f9962fbc1741a730ec74b655940ea15424b.tar.bz2 |
[X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128 (reapplied)
As reported on PR26235, we don't currently make use of the VBROADCASTF128/VBROADCASTI128 instructions (or the AVX512 equivalents) to load+splat a 128-bit vector to both lanes of a 256-bit vector.
This patch enables lowering from subvector insertion/concatenation patterns and auto-upgrades the llvm.x86.avx.vbroadcastf128.pd.256 / llvm.x86.avx.vbroadcastf128.ps.256 intrinsics to match.
We could possibly investigate using VBROADCASTF128/VBROADCASTI128 to load repeated constants as well (similar to how we already do for scalar broadcasts).
Reapplied with fix for PR28657 - removed intrinsic definitions (clang companion patch to be be submitted shortly).
Differential Revision: https://reviews.llvm.org/D22460
llvm-svn: 276416
Diffstat (limited to 'llvm/lib/IR/AutoUpgrade.cpp')
-rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 21 |
1 files changed, 14 insertions, 7 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 2e4a2f8..a8145b6 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -296,6 +296,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("avx.blend.p") || Name == "avx2.pblendw" || Name.startswith("avx2.pblendd.") || + Name.startswith("avx.vbroadcastf128") || Name == "avx2.vbroadcasti128" || Name == "xop.vpcmov" || (Name.startswith("xop.vpcom") && F->arg_size() == 2))) { @@ -886,7 +887,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); - } else if (IsX86 && Name.startswith("avx.vbroadcast")) { + } else if (IsX86 && Name.startswith("avx.vbroadcast.s")) { // Replace broadcasts with a series of insertelements. Type *VecTy = CI->getType(); Type *EltTy = VecTy->getVectorElementType(); @@ -918,15 +919,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { bool DoSext = (StringRef::npos != Name.find("pmovsx")); Rep = DoSext ? Builder.CreateSExt(SV, DstTy) : Builder.CreateZExt(SV, DstTy); - } else if (IsX86 && Name == "avx2.vbroadcasti128") { - // Replace vbroadcasts with a vector shuffle. - Type *VT = VectorType::get(Type::getInt64Ty(C), 2); + } else if (IsX86 && (Name.startswith("avx.vbroadcastf128") || + Name == "avx2.vbroadcasti128")) { + // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle. + Type *EltTy = CI->getType()->getVectorElementType(); + unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits(); + Type *VT = VectorType::get(EltTy, NumSrcElts); Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0), PointerType::getUnqual(VT)); Value *Load = Builder.CreateLoad(VT, Op); - uint32_t Idxs[4] = { 0, 1, 0, 1 }; - Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), - Idxs); + if (NumSrcElts == 2) + Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), + { 0, 1, 0, 1 }); + else + Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), + { 0, 1, 2, 3, 0, 1, 2, 3 }); } else if (IsX86 && (Name.startswith("avx2.pbroadcast") || Name.startswith("avx2.vbroadcast") || Name.startswith("avx512.pbroadcast") || |