diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 13 | ||||
| -rw-r--r-- | llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll | 29 |
2 files changed, 20 insertions, 22 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index df761f9b..da03a69 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1566,6 +1566,8 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { m_Mask(OuterMask)))) return false; + auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0)); + auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1)); auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType()); auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType()); @@ -1607,14 +1609,15 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { InstructionCost OldCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, - InnerMask0, CostKind) + + InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy, - InnerMask1, CostKind) + + InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, - OuterMask, CostKind, 0, nullptr, std::nullopt, &I); + OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I); - InstructionCost NewCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, NewMask, CostKind); + InstructionCost NewCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, + NewMask, CostKind, 0, nullptr, {V0, V1}); LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll index 21d9d1c..57df36a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -1,6 
+1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s ; fold to identity @@ -44,22 +44,17 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { ret <8 x i32> %concat } +; broadcast loads are free on AVX (and blends are much cheaper than general 2-operand shuffles) + define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) { -; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; SSE-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer -; SSE-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer -; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3> -; SSE-NEXT: ret <4 x double> [[BLEND]] -; -; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64( -; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { -; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 -; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 -; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0> -; AVX-NEXT: ret <4 x double> [[BLEND]] +; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64( +; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: 
[[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32 +; CHECK-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32 +; CHECK-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3> +; CHECK-NEXT: ret <4 x double> [[BLEND]] ; %ld0 = load <4 x double>, ptr %p0, align 32 %ld1 = load <4 x double>, ptr %p1, align 32 |
