author     Simon Pilgrim <llvm-dev@redking.me.uk>   2020-07-04 15:27:55 +0100
committer  Simon Pilgrim <llvm-dev@redking.me.uk>   2020-07-04 15:28:15 +0100
commit     56a8a5c9fe24d6d8809313633a4a5786d4fd29c6 (patch)
tree       f8fc79185a069969a68dc7fc0c49a74d85ac573e
parent     e56e96a264268a1df018f8b0a8c4caa18397a75d (diff)
[DAG] matchBinOpReduction - match subvector reduction patterns beyond a matched shufflevector reduction
Currently matchBinOpReduction only handles shufflevector reduction patterns, but in many cases these only occur in the final stages of a reduction, once we're down to legal vector widths.
Before this, it's likely that we are performing reductions using subvector extractions to repeatedly split the source vector in half and perform the binop on the halves.
Assuming we've found a non-partial reduction, this patch continues looking for subvector reductions as far as it can beyond the last shufflevector.
Fixes PR37890
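
For illustration, the kind of input this targets (a minimal IR sketch with a hypothetical function name, not one of the patch's test cases): the first reduction stage splits the source in half with half-width shufflevectors, which become an EXTRACT_SUBVECTOR pair feeding an ADD in the DAG, and only the later stages use the shuffle patterns matchBinOpReduction already recognized.

; Minimal sketch (hypothetical @reduce_v8i32; the patch's actual tests live
; in llvm/test/CodeGen/X86/horizontal-reduce-*.ll). Stage 1 halves the
; vector (lowered to EXTRACT_SUBVECTOR + ADD); stages 2-3 are the shuffle
; reduction stages at legal width.
define i32 @reduce_v8i32(<8 x i32> %a) {
  %lo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s0 = add <4 x i32> %lo, %hi
  %h1 = shufflevector <4 x i32> %s0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %s1 = add <4 x i32> %s0, %h1
  %h2 = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %s2 = add <4 x i32> %s1, %h2
  %r = extractelement <4 x i32> %s2, i32 0
  ret i32 %r
}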
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp   22
-rw-r--r--  llvm/test/CodeGen/X86/horizontal-reduce-add.ll   59
-rw-r--r--  llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll  31

3 files changed, 56 insertions, 56 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 202cacd..732aea8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9381,6 +9381,28 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
     PrevOp = Op;
   }
 
+  // Handle subvector reductions, which tend to appear after the shuffle
+  // reduction stages.
+  while (Op.getOpcode() == CandidateBinOp) {
+    unsigned NumElts = Op.getValueType().getVectorNumElements();
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+    if (Op0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+        Op0.getOperand(0) != Op1.getOperand(0))
+      break;
+    SDValue Src = Op0.getOperand(0);
+    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+    if (NumSrcElts != (2 * NumElts))
+      break;
+    if (!(Op0.getConstantOperandAPInt(1) == 0 &&
+          Op1.getConstantOperandAPInt(1) == NumElts) &&
+        !(Op1.getConstantOperandAPInt(1) == 0 &&
+          Op0.getConstantOperandAPInt(1) == NumElts))
+      break;
+    Op = Src;
+  }
+
   BinOp = (ISD::NodeType)CandidateBinOp;
   return Op;
 }
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index b73dc92..64d8de9 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -29,10 +29,9 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; SSSE3-FAST-LABEL: PR37890_v4i32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movd %xmm1, %eax
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v4i32:
@@ -46,8 +45,7 @@ define i32 @PR37890_v4i32(<4 x i32> %a) {
 ;
 ; AVX1-FAST-LABEL: PR37890_v4i32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: retq
@@ -98,10 +96,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; SSSE3-FAST-LABEL: PR37890_v8i16:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -121,10 +117,8 @@ define i16 @PR37890_v8i16(<8 x i16> %a) {
 ;
 ; AVX1-FAST-LABEL: PR37890_v8i16:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -177,10 +171,9 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; SSSE3-FAST-LABEL: PR37890_v8i32:
 ; SSSE3-FAST: # %bb.0:
 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movd %xmm1, %eax
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v8i32:
@@ -198,9 +191,8 @@ define i32 @PR37890_v8i32(<8 x i32> %a) {
 ; AVX1-FAST-LABEL: PR37890_v8i32:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: vzeroupper
@@ -261,10 +253,8 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; SSSE3-FAST-LABEL: PR37890_v16i16:
 ; SSSE3-FAST: # %bb.0:
 ; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -288,11 +278,9 @@ define i16 @PR37890_v16i16(<16 x i16> %a) {
 ; AVX1-FAST-LABEL: PR37890_v16i16:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -381,13 +369,12 @@ define i32 @PR37890_v16i32(<16 x i32> %a) {
 ;
 ; AVX1-FAST-LABEL: PR37890_v16i32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
index 13cf83b..ff635b9 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -31,11 +31,8 @@ define float @PR37890_v4f32(<4 x float> %a) {
 ;
 ; SSSE3-FAST-LABEL: PR37890_v4f32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v4f32:
@@ -48,8 +45,7 @@ define float @PR37890_v4f32(<4 x float> %a) {
 ;
 ; AVX1-FAST-LABEL: PR37890_v4f32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: retq
 ;
@@ -106,7 +102,7 @@ define double @PR37890_v4f64(<4 x double> %a) {
 ; AVX1-FAST-LABEL: PR37890_v4f64:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vzeroupper
 ; AVX1-FAST-NEXT: retq
@@ -154,11 +150,8 @@ define float @PR37890_v8f32(<8 x float> %a) {
 ; SSSE3-FAST-LABEL: PR37890_v8f32:
 ; SSSE3-FAST: # %bb.0:
 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v8f32:
@@ -175,9 +168,8 @@ define float @PR37890_v8f32(<8 x float> %a) {
 ; AVX1-FAST-LABEL: PR37890_v8f32:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vzeroupper
 ; AVX1-FAST-NEXT: retq
@@ -248,7 +240,7 @@ define double @PR37890_v8f64(<8 x double> %a) {
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vzeroupper
 ; AVX1-FAST-NEXT: retq
@@ -327,9 +319,8 @@ define float @PR37890_v16f32(<16 x float> %a) {
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vzeroupper
 ; AVX1-FAST-NEXT: retq
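
For context, a sketch of how a target combine consults the matcher (the call shape and third parameter follow the SelectionDAG API as it stood around this commit; the surrounding variable names and the specific candidate-opcode list are illustrative, not part of this patch):

  // Extract is the EXTRACT_VECTOR_ELT of lane 0 rooting a candidate reduction.
  ISD::NodeType BinOp;
  SDValue Rdx = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD, ISD::FADD});
  if (!Rdx)
    return SDValue();
  // After this patch, Rdx can be the wide pre-split source (e.g. v16i32)
  // reached by walking back through the EXTRACT_SUBVECTOR + binop stages,
  // not just the legal-width vector matched from the shuffle stages, so the
  // target can emit PHADD/HADDPS chains over the extracted halves.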