diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2020-08-13 12:42:43 +0100 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2020-08-13 12:42:59 +0100 |
commit | a31d20e67e2bbdbf5afd72b846f681023ff3bc4c (patch) | |
tree | 34982da95a52195be82ff23a23b959360b1d5a2d /llvm | |
parent | e63cc8105adfd452aebd079d2c0b2e915bcbc6d5 (diff) | |
download | llvm-a31d20e67e2bbdbf5afd72b846f681023ff3bc4c.zip llvm-a31d20e67e2bbdbf5afd72b846f681023ff3bc4c.tar.gz llvm-a31d20e67e2bbdbf5afd72b846f681023ff3bc4c.tar.bz2 |
[X86][SSE] IsElementEquivalent - add HOP(X,X) support
For HADD/HSUB/PACKS ops with repeated operands, the lower/upper half elements of each lane are known to be equivalent
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 46 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/haddsub-3.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/haddsub-shuf.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/haddsub-undef.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/phaddsub.ll | 4 |
5 files changed, 50 insertions, 27 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4480c6e..da5bb92 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10763,13 +10763,39 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) return false; - if (Op.getOpcode() == ISD::BUILD_VECTOR) { + switch (Op.getOpcode()) { + case ISD::BUILD_VECTOR: // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. // TODO: Handle MaskSize != Op.getNumOperands()? if (MaskSize == (int)Op.getNumOperands() && MaskSize == (int)ExpectedOp.getNumOperands()) return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); + break; + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: + case X86ISD::PACKSS: + case X86ISD::PACKUS: + // HOP(X,X) can refer to the elt from the lower/upper half of a lane. + // TODO: Handle MaskSize != NumElts? + // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. + if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { + MVT VT = Op.getSimpleValueType(); + int NumElts = VT.getVectorNumElements(); + if (MaskSize == NumElts) { + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + int NumHalfEltsPerLane = NumEltsPerLane / 2; + bool SameLane = + (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane); + bool SameElt = + (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane); + return SameLane && SameElt; + } + } + break; } return false; @@ -34012,17 +34038,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. 
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (isTargetShuffleEquivalent(Mask, {0, 0})) { + if (isTargetShuffleEquivalent(Mask, {0, 0}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } - if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { + if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; @@ -34031,17 +34057,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); - if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } - if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } - if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { + if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; @@ -34051,19 +34077,19 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); - if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { + if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( - Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { + Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 
12, 12, 14, 14}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( - Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { + Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index e0ea725..98898c7 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -29,10 +29,8 @@ define float @pr26491(<4 x float> %a0) { ; SSSE3-FAST-LABEL: pr26491: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-FAST-NEXT: addss %xmm0, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-FAST-NEXT: addss %xmm1, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pr26491: @@ -46,7 +44,7 @@ define float @pr26491(<4 x float> %a0) { ; AVX1-FAST-LABEL: pr26491: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 5c757d3..4f7528b 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -855,13 +855,13 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) { ; SSSE3-LABEL: broadcast_haddps_v4f32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSSE3-NEXT: retq ; ; AVX1-LABEL: broadcast_haddps_v4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: broadcast_haddps_v4f32: diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index ce5d182..5a9da36 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -587,7 +587,7 @@ define <4 x float> @add_ps_017(<4 x float> %x) { ; SSE-FAST-LABEL: add_ps_017: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: add_ps_017: @@ -600,7 +600,7 @@ define <4 x float> @add_ps_017(<4 x float> %x) { ; AVX-FAST-LABEL: add_ps_017: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2> %add = fadd <4 x float> %l, %x @@ -612,13 +612,13 @@ define <4 x float> @add_ps_018(<4 x float> %x) { ; SSE-LABEL: add_ps_018: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_018: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef> @@ -929,9 +929,8 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; ; SSE-FAST-LABEL: PR45747_2: ; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: movaps %xmm1, %xmm0 -; SSE-FAST-NEXT: haddps 
%xmm1, %xmm0 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-FAST-NEXT: haddps %xmm1, %xmm1 +; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: PR45747_2: @@ -944,7 +943,7 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; AVX-FAST-LABEL: PR45747_2: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef> %t1 = fadd <4 x float> %t0, %b diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll index ebf0951..163631c 100644 --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -451,13 +451,13 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source6: ; SSSE3: # %bb.0: ; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source6: ; AVX: # %bb.0: ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef> |