8 files changed, 395 insertions, 161 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index cd7074e..b514755 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2015-06-23  Uros Bizjak  <ubizjak@gmail.com>
+
+	PR target/66560
+	* config/i386/predicates.md (addsub_vm_operator): New predicate.
+	(addsub_vs_operator): Ditto.
+	(addsub_vs_parallel): Ditto.
+	* config/i386/sse.md (ssedoublemode): Add V4SF and V2DF modes.
+	(avx_addsubv4df3, avx_addsubv8sf3, sse3_addsubv2df3, sse3_addsubv4sf3):
+	Put minus RTX before plus and adjust vec_merge selector.
+	(*avx_addsubv4df3_1, *avx_addsubv4df3_1s, *sse3_addsubv2df3_1)
+	(*sse3_addsubv2df3_1s, *avx_addsubv8sf3_1, *avx_addsubv8sf3_1s)
+	(*sse3_addsubv4sf3_1, *sse3_addsubv4sf3_1s): Remove insn patterns.
+	(addsub vec_merge splitters): New combiner splitters.
+	(addsub vec_select/vec_concat splitters): Ditto.
+
 2015-06-23  Bin Cheng  <bin.cheng@arm.com>
 
 	PR tree-optimization/66449
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b7bb84f..4e45246 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1426,8 +1426,105 @@
   (and (match_code "unspec_volatile")
        (match_test "XINT (op, 1) == UNSPECV_VZEROUPPER")))
 
-;; Return true if OP is a parallel for a vbroadcast permute.
+;; Return true if OP is an addsub vec_merge operation.
+(define_predicate "addsub_vm_operator"
+  (match_code "vec_merge")
+{
+  rtx op0, op1;
+  int swapped;
+  HOST_WIDE_INT mask;
+  int nunits, elt;
+
+  op0 = XEXP (op, 0);
+  op1 = XEXP (op, 1);
+
+  /* Sanity check: operands must be MINUS/PLUS in either order.  */
+  if (GET_CODE (op0) == MINUS && GET_CODE (op1) == PLUS)
+    swapped = 0;
+  else if (GET_CODE (op0) == PLUS && GET_CODE (op1) == MINUS)
+    swapped = 1;
+  else
+    gcc_unreachable ();
+
+  mask = INTVAL (XEXP (op, 2));
+  nunits = GET_MODE_NUNITS (mode);
+
+  for (elt = 0; elt < nunits; elt++)
+    {
+      /* bit clear: take from op1, set: take from op0  */
+      int bit = !(mask & (HOST_WIDE_INT_1U << elt));
+
+      if (bit != ((elt & 1) ^ swapped))
+	return false;
+    }
+
+  return true;
+})
+
+;; Return true if OP is an addsub vec_select/vec_concat operation.
+(define_predicate "addsub_vs_operator"
+  (and (match_code "vec_select")
+       (match_code "vec_concat" "0"))
+{
+  rtx op0, op1;
+  bool swapped;
+  int nunits, elt;
+
+  op0 = XEXP (XEXP (op, 0), 0);
+  op1 = XEXP (XEXP (op, 0), 1);
+
+  /* Sanity check.  */
+  if (GET_CODE (op0) == MINUS && GET_CODE (op1) == PLUS)
+    swapped = false;
+  else if (GET_CODE (op0) == PLUS && GET_CODE (op1) == MINUS)
+    swapped = true;
+  else
+    gcc_unreachable ();
+
+  nunits = GET_MODE_NUNITS (mode);
+  if (XVECLEN (XEXP (op, 1), 0) != nunits)
+    return false;
+
+  /* We already checked that permutation is suitable for addsub,
+     so only look at the first element of the parallel.  */
+  elt = INTVAL (XVECEXP (XEXP (op, 1), 0, 0));
 
+  return elt == (swapped ? nunits : 0);
+})
+
+;; Return true if OP is a parallel for an addsub vec_select.
+(define_predicate "addsub_vs_parallel"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  int nelt = XVECLEN (op, 0);
+  int elt, i;
+  
+  if (nelt < 2)
+    return false;
+
+  /* Check that the permutation is suitable for addsub.
+     For example, { 0 9 2 11 4 13 6 15 } or { 8 1 10 3 12 5 14 7 }.  */
+  elt = INTVAL (XVECEXP (op, 0, 0));
+  if (elt == 0)
+    {
+      for (i = 1; i < nelt; ++i)
+	if (INTVAL (XVECEXP (op, 0, i)) != (i + (i & 1) * nelt))
+	  return false;
+    }
+  else if (elt == nelt)
+    {
+      for (i = 1; i < nelt; ++i)
+	if (INTVAL (XVECEXP (op, 0, i)) != (elt + i - (i & 1) * nelt))
+	  return false;
+    }
+  else
+    return false;
+
+  return true;
+})
+
+;; Return true if OP is a parallel for a vbroadcast permute.
 (define_predicate "avx_vbroadcast_operand"
   (and (match_code "parallel")
        (match_code "const_int" "a"))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d1277ca..9c95816 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -487,10 +487,12 @@
    (V4SI "v4di")   (V8SI "v8di")   (V16SI "v16di")])
 
 (define_mode_attr ssedoublemode
-  [(V16SF "V32SF") (V16SI "V32SI") (V8DI "V16DI") (V8DF "V16DF")
-   (V8SF "V16SF") (V8SI "V16SI") (V4DI "V8DI") (V4DF "V8DF")
-   (V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI") (V4SI "V4DI")
-   (V32HI "V32SI") (V32QI "V32HI") (V16QI "V16HI") (V64QI "V64HI")])
+  [(V4SF "V8SF") (V8SF "V16SF") (V16SF "V32SF")
+   (V2DF "V4DF") (V4DF "V8DF") (V8DF "V16DF")
+   (V16QI "V16HI") (V32QI "V32HI") (V64QI "V64HI")
+   (V4HI "V4SI") (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI")
+   (V4SI "V4DI") (V8SI "V16SI") (V16SI "V32SI")
+   (V4DI "V8DI") (V8DI "V16DI")])
 
 (define_mode_attr ssebytemode
   [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")])
@@ -2021,43 +2023,11 @@
 (define_insn "avx_addsubv4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_merge:V4DF
-	  (plus:V4DF
+	  (minus:V4DF
 	    (match_operand:V4DF 1 "register_operand" "x")
 	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	  (minus:V4DF (match_dup 1) (match_dup 2))
-	  (const_int 10)))]
-  "TARGET_AVX"
-  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4DF")])
-
-(define_insn "*avx_addsubv4df3_1"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-  	(vec_select:V4DF
-	  (vec_concat:V8DF
-	    (minus:V4DF
-	      (match_operand:V4DF 1 "register_operand" "x")
-	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	    (plus:V4DF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_AVX"
-  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V4DF")])
-
-(define_insn "*avx_addsubv4df3_1s"
-  [(set (match_operand:V4DF 0 "register_operand" "=x")
-  	(vec_select:V4DF
-	  (vec_concat:V8DF
-	    (minus:V4DF
-	      (match_operand:V4DF 1 "register_operand" "x")
-	      (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
-	    (plus:V4DF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
+	  (plus:V4DF (match_dup 1) (match_dup 2))
+	  (const_int 5)))]
   "TARGET_AVX"
   "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
@@ -2067,49 +2037,11 @@
 (define_insn "sse3_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
 	(vec_merge:V2DF
-	  (plus:V2DF
+	  (minus:V2DF
 	    (match_operand:V2DF 1 "register_operand" "0,x")
 	    (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	  (minus:V2DF (match_dup 1) (match_dup 2))
-	  (const_int 2)))]
-  "TARGET_SSE3"
-  "@
-   addsubpd\t{%2, %0|%0, %2}
-   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "atom_unit" "complex")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "V2DF")])
-
-(define_insn "*sse3_addsubv2df3_1"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-	(vec_select:V2DF
-	  (vec_concat:V4DF
-	    (minus:V2DF
-	      (match_operand:V2DF 1 "register_operand" "0,x")
-	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V2DF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 3)])))]
-  "TARGET_SSE3"
-  "@
-   addsubpd\t{%2, %0|%0, %2}
-   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "atom_unit" "complex")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "V2DF")])
-
-(define_insn "*sse3_addsubv2df3_1s"
-  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
-	(vec_select:V2DF
-	  (vec_concat:V4DF
-	    (minus:V2DF
-	      (match_operand:V2DF 1 "register_operand" "0,x")
-	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V2DF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 3)])))]
+	  (plus:V2DF (match_dup 1) (match_dup 2))
+	  (const_int 1)))]
   "TARGET_SSE3"
   "@
    addsubpd\t{%2, %0|%0, %2}
@@ -2123,47 +2055,11 @@
 (define_insn "avx_addsubv8sf3"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(vec_merge:V8SF
-	  (plus:V8SF
+	  (minus:V8SF
 	    (match_operand:V8SF 1 "register_operand" "x")
 	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	  (minus:V8SF (match_dup 1) (match_dup 2))
-	  (const_int 170)))]
-  "TARGET_AVX"
-  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
-
-(define_insn "*avx_addsubv8sf3_1"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(vec_select:V8SF
-	  (vec_concat:V16SF
-	    (minus:V8SF
-	      (match_operand:V8SF 1 "register_operand" "x")
-	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	    (plus:V8SF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 9)
-		     (const_int 2) (const_int 11)
-		     (const_int 4) (const_int 13)
-		     (const_int 6) (const_int 15)])))]
-  "TARGET_AVX"
-  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "V8SF")])
-
-(define_insn "*avx_addsubv8sf3_1s"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(vec_select:V8SF
-	  (vec_concat:V16SF
-	    (minus:V8SF
-	      (match_operand:V8SF 1 "register_operand" "x")
-	      (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
-	    (plus:V8SF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 9)
-		     (const_int 2) (const_int 11)
-		     (const_int 4) (const_int 13)
-		     (const_int 6) (const_int 15)])))]
+	  (plus:V8SF (match_dup 1) (match_dup 2))
+	  (const_int 85)))]
   "TARGET_AVX"
   "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sseadd")
@@ -2173,11 +2069,11 @@
 (define_insn "sse3_addsubv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_merge:V4SF
-	  (plus:V4SF
+	  (minus:V4SF
 	    (match_operand:V4SF 1 "register_operand" "0,x")
 	    (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	  (minus:V4SF (match_dup 1) (match_dup 2))
-	  (const_int 10)))]
+	  (plus:V4SF (match_dup 1) (match_dup 2))
+	  (const_int 5)))]
   "TARGET_SSE3"
   "@
    addsubps\t{%2, %0|%0, %2}
@@ -2188,45 +2084,123 @@
    (set_attr "prefix_rep" "1,*")
    (set_attr "mode" "V4SF")])
 
-(define_insn "*sse3_addsubv4sf3_1"
-  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
-	(vec_select:V4SF
-	  (vec_concat:V8SF
-	    (minus:V4SF
-	      (match_operand:V4SF 1 "register_operand" "0,x")
-	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V4SF (match_dup 1) (match_dup 2)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_SSE3"
-  "@
-   addsubps\t{%2, %0|%0, %2}
-   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "prefix_rep" "1,*")
-   (set_attr "mode" "V4SF")])
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 6 "addsub_vm_operator"
+	  [(minus:VF_128_256
+	     (match_operand:VF_128_256 1 "register_operand")
+	     (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	   (plus:VF_128_256
+	     (match_operand:VF_128_256 3 "nonimmediate_operand")
+	     (match_operand:VF_128_256 4 "nonimmediate_operand"))
+	   (match_operand 5 "const_int_operand")]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (plus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (match_dup 5)))])
 
-(define_insn "*sse3_addsubv4sf3_1s"
-  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
-	(vec_select:V4SF
-	  (vec_concat:V8SF
-	    (minus:V4SF
-	      (match_operand:V4SF 1 "register_operand" "0,x")
-	      (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
-	    (plus:V4SF (match_dup 2) (match_dup 1)))
-	  (parallel [(const_int 0) (const_int 5)
-		     (const_int 2) (const_int 7)])))]
-  "TARGET_SSE3"
-  "@
-   addsubps\t{%2, %0|%0, %2}
-   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseadd")
-   (set_attr "prefix" "orig,vex")
-   (set_attr "prefix_rep" "1,*")
-   (set_attr "mode" "V4SF")])
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 6 "addsub_vm_operator"
+	  [(plus:VF_128_256
+	     (match_operand:VF_128_256 1 "nonimmediate_operand")
+	     (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	   (minus:VF_128_256
+	     (match_operand:VF_128_256 3 "register_operand")
+	     (match_operand:VF_128_256 4 "nonimmediate_operand"))
+	   (match_operand 5 "const_int_operand")]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (plus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (match_dup 5)))]
+{
+  /* Invert mask bits to compensate for swapped PLUS and MINUS RTXes.  */
+  operands[5]
+    = GEN_INT (~INTVAL (operands[5])
+	       & ((HOST_WIDE_INT_1U << GET_MODE_NUNITS (<MODE>mode)) - 1));
+})
+
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 7 "addsub_vs_operator"
+	  [(vec_concat:<ssedoublemode>
+	     (minus:VF_128_256
+	       (match_operand:VF_128_256 1 "register_operand")
+	       (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	     (plus:VF_128_256
+	       (match_operand:VF_128_256 3 "nonimmediate_operand")
+	       (match_operand:VF_128_256 4 "nonimmediate_operand")))
+	   (match_parallel 5 "addsub_vs_parallel"
+	     [(match_operand 6 "const_int_operand")])]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (plus:VF_128_256 (match_dup 1) (match_dup 2))
+	  (match_dup 5)))]
+{
+  int i, nelt = XVECLEN (operands[5], 0);
+  HOST_WIDE_INT ival = 0;
+
+  for (i = 0; i < nelt; i++)
+    if (INTVAL (XVECEXP (operands[5], 0, i)) < GET_MODE_NUNITS (<MODE>mode))
+      ival |= HOST_WIDE_INT_1 << i;
+
+  operands[5] = GEN_INT (ival);
+})
+
+(define_split
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(match_operator:VF_128_256 7 "addsub_vs_operator"
+	  [(vec_concat:<ssedoublemode>
+	     (plus:VF_128_256
+	       (match_operand:VF_128_256 1 "nonimmediate_operand")
+	       (match_operand:VF_128_256 2 "nonimmediate_operand"))
+	     (minus:VF_128_256
+	       (match_operand:VF_128_256 3 "register_operand")
+	       (match_operand:VF_128_256 4 "nonimmediate_operand")))
+	   (match_parallel 5 "addsub_vs_parallel"
+	     [(match_operand 6 "const_int_operand")])]))]
+  "TARGET_SSE3
+   && can_create_pseudo_p ()
+   && ((rtx_equal_p (operands[1], operands[3])
+	&& rtx_equal_p (operands[2], operands[4]))
+       || (rtx_equal_p (operands[1], operands[4])
+	   && rtx_equal_p (operands[2], operands[3])))"
+  [(set (match_dup 0)
+	(vec_merge:VF_128_256
+	  (minus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (plus:VF_128_256 (match_dup 3) (match_dup 4))
+	  (match_dup 5)))]
+{
+  int i, nelt = XVECLEN (operands[5], 0);
+  HOST_WIDE_INT ival = 0;
+
+  for (i = 0; i < nelt; i++)
+    if (INTVAL (XVECEXP (operands[5], 0, i)) >= GET_MODE_NUNITS (<MODE>mode))
+      ival |= HOST_WIDE_INT_1 << i;
+
+  operands[5] = GEN_INT (ival);
+})
 
 (define_insn "avx_h<plusminus_insn>v4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index f17ae0d..8e2ab43 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2015-06-23  Uros Bizjak  <ubizjak@gmail.com>
+
+	PR target/66560
+	* gcc.target/i386/pr66560-1.c: New test.
+	* gcc.target/i386/pr66560-2.c: Ditto.
+	* gcc.target/i386/pr66560-3.c: Ditto.
+	* gcc.target/i386/pr66560-4.c: Ditto.
+
 2015-06-23  Thomas Schwinge  <thomas@codesourcery.com>
 
 	* gcc.target/nvptx/nvptx.exp: New file.
diff --git a/gcc/testsuite/gcc.target/i386/pr66560-1.c b/gcc/testsuite/gcc.target/i386/pr66560-1.c
new file mode 100644
index 0000000..b535ca7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66560-1.c
@@ -0,0 +1,35 @@
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4" } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+v4sf foo1 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x - y;
+  v4sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
+}
+
+v4sf foo2 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x - y;
+  v4sf tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 0, 5, 2, 7 });
+}
+
+v4sf foo3 (v4sf x, v4sf y)
+{
+  v4sf tem0 = x + y;
+  v4sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 4, 1, 6, 3 });
+}
+
+v4sf foo4 (v4sf x, v4sf y)
+{
+  v4sf tem0 = y + x;
+  v4sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4si) { 4, 1, 6, 3 });
+}
+
+/* { dg-final { scan-assembler-times "addsubps" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr66560-2.c b/gcc/testsuite/gcc.target/i386/pr66560-2.c
new file mode 100644
index 0000000..c308f3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66560-2.c
@@ -0,0 +1,35 @@
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4" } */
+
+typedef double v2df __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+v2df foo1 (v2df x, v2df y)
+{
+  v2df tem0 = x - y;
+  v2df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 0, 3 });
+}
+
+v2df foo2 (v2df x, v2df y)
+{
+  v2df tem0 = x - y;
+  v2df tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 0, 3 });
+}
+
+v2df foo3 (v2df x, v2df y)
+{
+  v2df tem0 = x + y;
+  v2df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 2, 1 });
+}
+
+v2df foo4 (v2df x, v2df y)
+{
+  v2df tem0 = y + x;
+  v2df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v2di) { 2, 1 });
+}
+
+/* { dg-final { scan-assembler-times "addsubpd" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr66560-3.c b/gcc/testsuite/gcc.target/i386/pr66560-3.c
new file mode 100644
index 0000000..22f19d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66560-3.c
@@ -0,0 +1,35 @@
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef float v8sf __attribute__((vector_size(32)));
+typedef int v8si __attribute__((vector_size(32)));
+v8sf foo1 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x - y;
+  v8sf tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 0, 9, 2, 11, 4, 13, 6, 15 });
+}
+
+v8sf foo2 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x - y;
+  v8sf tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 0, 9, 2, 11, 4, 13, 6, 15 });
+}
+
+v8sf foo3 (v8sf x, v8sf y)
+{
+  v8sf tem0 = x + y;
+  v8sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 8, 1, 10, 3, 12, 5, 14, 7 });
+}
+
+v8sf foo4 (v8sf x, v8sf y)
+{
+  v8sf tem0 = y + x;
+  v8sf tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v8si) { 8, 1, 10, 3, 12, 5, 14, 7 });
+}
+
+/* { dg-final { scan-assembler-times "vaddsubps" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr66560-4.c b/gcc/testsuite/gcc.target/i386/pr66560-4.c
new file mode 100644
index 0000000..a8a6e90
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66560-4.c
@@ -0,0 +1,35 @@
+/* PR target/66560 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+typedef double v4df __attribute__((vector_size(32)));
+typedef long long v4di __attribute__((vector_size(32)));
+v4df foo1 (v4df x, v4df y)
+{
+  v4df tem0 = x - y;
+  v4df tem1 = x + y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
+}
+
+v4df foo2 (v4df x, v4df y)
+{
+  v4df tem0 = x - y;
+  v4df tem1 = y + x;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 0, 5, 2, 7 });
+}
+
+v4df foo3 (v4df x, v4df y)
+{
+  v4df tem0 = x + y;
+  v4df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 4, 1, 6, 3 });
+}
+
+v4df foo4 (v4df x, v4df y)
+{
+  v4df tem0 = y + x;
+  v4df tem1 = x - y;
+  return __builtin_shuffle (tem0, tem1, (v4di) { 4, 1, 6, 3 });
+}
+
+/* { dg-final { scan-assembler-times "vaddsubpd" 4 } } */