aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2020-07-04 15:27:55 +0100
committerSimon Pilgrim <llvm-dev@redking.me.uk>2020-07-04 15:28:15 +0100
commit56a8a5c9fe24d6d8809313633a4a5786d4fd29c6 (patch)
treef8fc79185a069969a68dc7fc0c49a74d85ac573e
parente56e96a264268a1df018f8b0a8c4caa18397a75d (diff)
downloadllvm-56a8a5c9fe24d6d8809313633a4a5786d4fd29c6.zip
llvm-56a8a5c9fe24d6d8809313633a4a5786d4fd29c6.tar.gz
llvm-56a8a5c9fe24d6d8809313633a4a5786d4fd29c6.tar.bz2
[DAG] matchBinOpReduction - match subvector reduction patterns beyond a matched shufflevector reduction
Currently matchBinOpReduction only handles shufflevector reduction patterns, but in many cases these only occur in the final stages of a reduction, once we're down to legal vector widths. Before this, it's likely that we are performing reductions using subvector extractions to repeatedly split the source vector in half and perform the binop on the halves. Assuming we've found a non-partial reduction, this patch continues looking for subvector reductions as far as it can beyond the last shufflevector. Fixes PR37890
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp22
-rw-r--r--llvm/test/CodeGen/X86/horizontal-reduce-add.ll59
-rw-r--r--llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll31
3 files changed, 56 insertions, 56 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 202cacd..732aea8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9381,6 +9381,28 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
PrevOp = Op;
}
+ // Handle subvector reductions, which tend to appear after the shuffle
+ // reduction stages.
+ while (Op.getOpcode() == CandidateBinOp) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op0.getOperand(0) != Op1.getOperand(0))
+ break;
+ SDValue Src = Op0.getOperand(0);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (NumSrcElts != (2 * NumElts))
+ break;
+ if (!(Op0.getConstantOperandAPInt(1) == 0 &&
+ Op1.getConstantOperandAPInt(1) == NumElts) &&
+ !(Op1.getConstantOperandAPInt(1) == 0 &&
+ Op0.getConstantOperandAPInt(1) == NumElts))
+ break;
+ Op = Src;
+ }
+
BinOp = (ISD::NodeType)CandidateBinOp;
return Op;
}
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index b73dc92..64d8de9 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -29,10 +29,9 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
;
; SSSE3-FAST-LABEL: PR37890_v4i32:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movd %xmm1, %eax
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v4i32:
@@ -46,8 +45,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
;
; AVX1-FAST-LABEL: PR37890_v4i32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: retq
@@ -98,10 +96,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
;
; SSSE3-FAST-LABEL: PR37890_v8i16:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -121,10 +117,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
;
; AVX1-FAST-LABEL: PR37890_v8i16:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -177,10 +171,9 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; SSSE3-FAST-LABEL: PR37890_v8i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movd %xmm1, %eax
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v8i32:
@@ -198,9 +191,8 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
; AVX1-FAST-LABEL: PR37890_v8i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
@@ -261,10 +253,8 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; SSSE3-FAST-LABEL: PR37890_v16i16:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -288,11 +278,9 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
; AVX1-FAST-LABEL: PR37890_v16i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -381,13 +369,12 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
;
; AVX1-FAST-LABEL: PR37890_v16i32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
index 13cf83b..ff635b9 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -31,11 +31,8 @@ define float @PR37890_v4f32(<4 x float> %a) {
;
; SSSE3-FAST-LABEL: PR37890_v4f32:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v4f32:
@@ -48,8 +45,7 @@ define float @PR37890_v4f32(<4 x float> %a) {
;
; AVX1-FAST-LABEL: PR37890_v4f32:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
@@ -106,7 +102,7 @@ define double @PR37890_v4f64(<4 x double> %a) {
; AVX1-FAST-LABEL: PR37890_v4f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -154,11 +150,8 @@ define float @PR37890_v8f32(<8 x float> %a) {
; SSSE3-FAST-LABEL: PR37890_v8f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v8f32:
@@ -175,9 +168,8 @@ define float @PR37890_v8f32(<8 x float> %a) {
; AVX1-FAST-LABEL: PR37890_v8f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -248,7 +240,7 @@ define double @PR37890_v8f64(<8 x double> %a) {
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
@@ -327,9 +319,8 @@ define float @PR37890_v16f32(<16 x float> %a) {
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq