From 8571ff0ae0922bee292161c7fd61dd127d26a4ed Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 23 Aug 2021 14:15:14 +0200 Subject: Adjust inner loop cost scaling This makes use of the estimated number of iterations of the inner loop to limit --param vect-inner-loop-cost-factor scaling. It also reduces the maximum value of vect-inner-loop-cost-factor to 10000 making it less likely to cause overflow of costs. 2021-08-23 Richard Biener * doc/invoke.texi (vect-inner-loop-cost-factor): Adjust. * params.opt (--param vect-inner-loop-cost-factor): Adjust maximum value. * tree-vect-loop.c (vect_analyze_loop_form): Initialize inner_loop_cost_factor to the minimum of the estimated number of iterations of the inner loop and vect-inner-loop-cost-factor. --- gcc/doc/invoke.texi | 7 ++++--- gcc/params.opt | 4 ++-- gcc/tree-vect-loop.c | 7 +++++++ 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'gcc') diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index c057cc1..a9d56fe 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14386,9 +14386,10 @@ The parameter only has an effect on targets that support partial vector loads and stores. @item vect-inner-loop-cost-factor -The factor which the loop vectorizer applies to the cost of statements -in an inner loop relative to the loop being vectorized. The default -value is 50. +The maximum factor which the loop vectorizer applies to the cost of statements +in an inner loop relative to the loop being vectorized. The factor applied +is the maximum of the estimated number of iterations of the inner loop and +this parameter. The default value of this parameter is 50. @item avoid-fma-max-bits Maximum number of bits for which we avoid creating FMAs. 
diff --git a/gcc/params.opt b/gcc/params.opt index f926488..f414dc1 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1114,7 +1114,7 @@ Common Joined UInteger Var(param_vect_partial_vector_usage) Init(2) IntegerRange Controls how loop vectorizer uses partial vectors. 0 means never, 1 means only for loops whose need to iterate can be removed, 2 means for all loops. The default value is 2. -param=vect-inner-loop-cost-factor= -Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 999999) Param Optimization -The factor which the loop vectorizer applies to the cost of statements in an inner loop relative to the loop being vectorized. +Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 10000) Param Optimization +The maximum factor which the loop vectorizer applies to the cost of statements in an inner loop relative to the loop being vectorized. ; This comment is to ensure we retain the blank line above. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index c521b43a..0c8d992 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -1519,6 +1519,13 @@ vect_analyze_loop_form (class loop *loop, vec_info_shared *shared) stmt_vec_info inner_loop_cond_info = loop_vinfo->lookup_stmt (inner_loop_cond); STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type; + /* If we have an estimate on the number of iterations of the inner + loop use that to limit the scale for costing, otherwise use + --param vect-inner-loop-cost-factor literally. */ + widest_int nit; + if (estimated_stmt_executions (loop->inner, &nit)) + LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo) + = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi (); } gcc_assert (!loop->aux); -- cgit v1.1 From 6ddb30f941a44bd528904558673ab35394565f08 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Fri, 20 Aug 2021 15:30:40 +0800 Subject: Optimize (a & b) | (c & ~b) to vpternlog instruction. 
Also optimize below 3 forms to vpternlog, op1, op2, op3 are register_operand or unary_p as (not reg) A: (any_logic (any_logic op1 op2) op3) B: (any_logic (any_logic op1 op2) (any_logic op3 op4)) op3/op4 should be equal to op1/op2 C: (any_logic (any_logic (any_logic:op1 op2) op3) op4) op3/op4 should be equal to op1/op2 gcc/ChangeLog: PR target/101989 * config/i386/i386.c (ix86_rtx_costs): Define cost for UNSPEC_VTERNLOG. * config/i386/i386.h (STRIP_UNARY): New macro. * config/i386/predicates.md (reg_or_notreg_operand): New predicate. * config/i386/sse.md (*_vternlog_all): New define_insn. (*_vternlog_1): New pre_reload define_insn_and_split. (*_vternlog_2): Ditto. (*_vternlog_3): Ditto. (any_logic1,any_logic2): New code iterator. (logic_op): New code attribute. (ternlogsuffix): Extend to VNxDF and VNxSF. gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-1.c: New test. * gcc.target/i386/pr101989-2.c: New test. * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Adjust testcase. 
--- gcc/config/i386/i386.c | 5 + gcc/config/i386/i386.h | 2 + gcc/config/i386/predicates.md | 7 + gcc/config/i386/sse.md | 234 +++++++++++++++++++++ .../i386/avx512bw-shiftqihi-constant-1.c | 4 +- gcc/testsuite/gcc.target/i386/pr101989-1.c | 51 +++++ gcc/testsuite/gcc.target/i386/pr101989-2.c | 102 +++++++++ 7 files changed, 403 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-2.c (limited to 'gcc') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5bff131..ebec866 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -20542,6 +20542,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case UNSPEC: if (XINT (x, 1) == UNSPEC_TP) *total = 0; + else if (XINT(x, 1) == UNSPEC_VTERNLOG) + { + *total = cost->sse_op; + return true; + } return false; case VEC_SELECT: diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 11ac8d0..6511422 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1716,6 +1716,8 @@ typedef struct ix86_args { #define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X) +#define STRIP_UNARY(X) (UNARY_P (X) ? XEXP (X, 0) : X) + #define SYMBOLIC_CONST(X) \ (GET_CODE (X) == SYMBOL_REF \ || GET_CODE (X) == LABEL_REF \ diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 9321f33..df5acb4 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1044,6 +1044,13 @@ (ior (match_test "op == const1_rtx") (match_test "op == constm1_rtx"))))) +;; True for registers, or (not: registers). Used to optimize 3-operand +;; bitwise operation. +(define_predicate "reg_or_notreg_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "not") + (match_test "register_operand (XEXP (op, 0), mode)")))) + ;; True if OP is acceptable as operand of DImode shift expander. 
(define_predicate "shiftdi_operand" (if_then_else (match_test "TARGET_64BIT") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 95f9582..25ca9a5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -933,7 +933,9 @@ ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix [(V8DI "q") (V4DI "q") (V2DI "q") + (V8DF "q") (V4DF "q") (V2DF "q") (V16SI "d") (V8SI "d") (V4SI "d") + (V16SF "d") (V8SF "d") (V4SF "d") (V32HI "d") (V16HI "d") (V8HI "d") (V64QI "d") (V32QI "d") (V16QI "d")]) @@ -10041,6 +10043,238 @@ (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn "*_vternlog_all" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V + [(match_operand:V 1 "register_operand" "0") + (match_operand:V 2 "register_operand" "v") + (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:SI 4 "const_0_to_255_operand")] + UNSPEC_VTERNLOG))] + "TARGET_AVX512F" + "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +;; There must be lots of other combinations like +;; +;; (any_logic:V +;; (any_logic:V op1 op2) +;; (any_logic:V op1 op3)) +;; +;; (any_logic:V +;; (any_logic:V +;; (any_logic:V op1, op2) +;; op3) +;; op1) +;; +;; and so on. 
+ +(define_code_iterator any_logic1 [and ior xor]) +(define_code_iterator any_logic2 [and ior xor]) +(define_code_attr logic_op [(and "&") (ior "|") (xor "^")]) + +(define_insn_and_split "*_vpternlog_1" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (any_logic2:V + (match_operand:V 3 "reg_or_notreg_operand") + (match_operand:V 4 "reg_or_notreg_operand"))))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; + + tmp1 = reg1 reg2; + tmp2 = reg3 reg4; + reg_mask = tmp1 tmp2; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*_vpternlog_2" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (any_logic2:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")) + (match_operand:V 4 "reg_or_notreg_operand")))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split () + && (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4])) + || rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3])) + || rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[3])))" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 6) + (match_dup 2) + (match_dup 1) + (match_dup 5)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg6, reg2, reg1, imm8. */ + int reg6 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg3 = 0; + int reg4 = 0; + int reg_mask, tmp1, tmp2; + if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg1; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[2]), + STRIP_UNARY (operands[4]))) + { + reg4 = reg2; + reg3 = reg6; + operands[6] = operands[3]; + } + else if (rtx_equal_p (STRIP_UNARY (operands[1]), + STRIP_UNARY (operands[3]))) + { + reg4 = reg6; + reg3 = reg1; + operands[6] = operands[4]; + } + else + { + reg4 = reg6; + reg3 = reg2; + operands[6] = operands[4]; + } + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + reg4 = UNARY_P (operands[4]) ? 
~reg4 : reg4; + + tmp1 = reg1 reg2; + tmp2 = tmp1 reg3; + reg_mask = tmp2 reg4; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[6] = STRIP_UNARY (operands[6]); + operands[5] = GEN_INT (reg_mask); +}) + +(define_insn_and_split "*_vpternlog_3" + [(set (match_operand:V 0 "register_operand") + (any_logic:V + (any_logic1:V + (match_operand:V 1 "reg_or_notreg_operand") + (match_operand:V 2 "reg_or_notreg_operand")) + (match_operand:V 3 "reg_or_notreg_operand")))] + "( == 64 || TARGET_AVX512VL) + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (unspec:V + [(match_dup 3) + (match_dup 2) + (match_dup 1) + (match_dup 4)] + UNSPEC_VTERNLOG))] +{ + /* VPTERNLOGD reg3, reg2, reg1, imm8. */ + int reg3 = 0xF0; + int reg2 = 0xCC; + int reg1 = 0xAA; + int reg_mask, tmp1; + + reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1; + reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2; + reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3; + + tmp1 = reg1 reg2; + reg_mask = tmp1 reg3; + reg_mask &= 0xFF; + + operands[1] = STRIP_UNARY (operands[1]); + operands[2] = STRIP_UNARY (operands[2]); + operands[3] = STRIP_UNARY (operands[3]); + operands[4] = GEN_INT (reg_mask); +}) + + (define_insn "_vternlog_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c index 78bf5d3..fbc3de0 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c @@ -1,7 +1,8 @@ /* PR target/95524 */ /* { dg-do compile } */ /* { dg-options "-O2 -mavx512bw" } */ -/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */ +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } } */ +/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } } */ typedef char v64qi 
__attribute__ ((vector_size (64))); typedef unsigned char v64uqi __attribute__ ((vector_size (64))); @@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a) return a >> 2; } /* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */ -/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */ /* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */ __attribute__((noipa)) v64qi diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c new file mode 100644 index 0000000..594093e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler-times "vpternlog" 6 } } */ +/* { dg-final { scan-assembler-not "vpxor" } } */ +/* { dg-final { scan-assembler-not "vpor" } } */ +/* { dg-final { scan-assembler-not "vpand" } } */ + +#include +__m256d +__attribute__((noipa, target("avx512vl"))) +copysign2_pd(__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +__attribute__((noipa, target("avx512vl"))) +foo (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +__attribute__ ((noipa, target("avx512vl"))) +foo4 (__m256i src1, __m256i src2, __m256i src3) +{ + return src3 & 
src2 ^ src1; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c new file mode 100644 index 0000000..9d9759a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c @@ -0,0 +1,102 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL + +#include "avx512f-helper.h" + +#include "pr101989-1.c" +__m256d +avx2_copysign2_pd (__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +avx2_foo (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & ~src1) | (src3 & src1); +} + +__m256i +avx2_foo1 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (src3 & ~src1); +} + +__m256i +avx2_foo2 (__m256i src1, __m256i src2, __m256i src3) +{ + return (src2 & src1) | (~src3 & src1); +} + +__m256i +avx2_foo3 (__m256i src1, __m256i src2, __m256i src3) +{ + return (~src2 & src1) | (src3 & src1); +} + +__m256i +avx2_foo4 (__m256i src1, __m256i src2, __m256i src3) +{ + return src3 & src2 ^ src1; +} + + +void +test_256 (void) +{ + union256i_q q1, q2, q3, res2, exp2; + union256d d1, d2, res1, exp1; + int i, sign = 1; + + for (i = 0; i < 4; i++) + { + d1.a[i] = 12.34 * (i + 2000) * sign; + d2.a[i] = 56.78 * (i - 30) * sign; + q1.a[i] = 12 * (i + 2000) * sign; + q2.a[i] = 56 * (i - 30) * sign; + q3.a[i] = 90 * (i + 40) * sign; + res1.a[i] = DEFAULT_VALUE; + exp1.a[i] = DEFAULT_VALUE; + res2.a[i] = exp2.a[i] = -1; + sign = -sign; + } + + exp1.x = avx2_copysign2_pd (d1.x, d2.x); + res1.x = copysign2_pd (d1.x, d2.x); + if (UNION_CHECK (256, d) (res1, exp1.a)) + abort (); + + exp2.x = avx2_foo1 (q1.x, q2.x, q3.x); + res2.x = foo1 (q1.x, q2.x, q3.x); + if (UNION_CHECK 
(256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo2 (q1.x, q2.x, q3.x); + res2.x = foo2 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo3 (q1.x, q2.x, q3.x); + res2.x = foo3 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo4 (q1.x, q2.x, q3.x); + res2.x = foo4 (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); + + exp2.x = avx2_foo (q1.x, q2.x, q3.x); + res2.x = foo (q1.x, q2.x, q3.x); + if (UNION_CHECK (256, i_q) (res2, exp2.a)) + abort (); +} + +static void +test_128 () +{} -- cgit v1.1 From 8da9b4f73c2c878b48f45fa2ed47d8a9edd31262 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Tue, 24 Aug 2021 18:09:33 +0800 Subject: Enable avx512 embedde broadcast for vpternlog. gcc/ChangeLog: PR target/101989 * config/i386/sse.md (_vternlog): Enable avx512 embedded broadcast. (*_vternlog_all): Ditto. (_vternlog_mask): Ditto. gcc/testsuite/ChangeLog: PR target/101989 * gcc.target/i386/pr101989-broadcast-1.c: New test. 
--- gcc/config/i386/sse.md | 6 ++--- .../gcc.target/i386/pr101989-broadcast-1.c | 31 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c (limited to 'gcc') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 25ca9a5..03fc2df 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -10034,7 +10034,7 @@ (unspec:VI48_AVX512VL [(match_operand:VI48_AVX512VL 1 "register_operand" "0") (match_operand:VI48_AVX512VL 2 "register_operand" "v") - (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm") + (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F" @@ -10048,7 +10048,7 @@ (unspec:V [(match_operand:V 1 "register_operand" "0") (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "nonimmediate_operand" "vm") + (match_operand:V 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F" @@ -10281,7 +10281,7 @@ (unspec:VI48_AVX512VL [(match_operand:VI48_AVX512VL 1 "register_operand" "0") (match_operand:VI48_AVX512VL 2 "register_operand" "v") - (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm") + (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG) (match_dup 1) diff --git a/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c b/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c new file mode 100644 index 0000000..d03d192 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512vl" } */ +/* { dg-final { scan-assembler-times "vpternlog" 4 } } */ +/* { dg-final { scan-assembler-times "\\\{1to4\\\}" 4 } } */ +#include +extern long long C; +__m256d +copysign2_pd(__m256d from, __m256d to) { + __m256i a = _mm256_castpd_si256(from); + 
__m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63)); + /* (avx_signbit & from) | (~avx_signbit & to) */ + return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to)); +} + +__m256i +mask_pternlog (__m256i A, __m256i B, __mmask8 U) +{ + return _mm256_mask_ternarylogic_epi64 (A, U, B, _mm256_set1_epi64x (C) ,202); +} + +__m256i +maskz_pternlog (__m256i A, __m256i B, __mmask8 U) +{ + return _mm256_maskz_ternarylogic_epi64 (U, A, B, _mm256_set1_epi64x (C) ,202); +} + +__m256i +none_pternlog (__m256i A, __m256i B) +{ + return _mm256_ternarylogic_epi64 (A, B, _mm256_set1_epi64x (C) ,202); +} -- cgit v1.1 From 4702d3cf044924970a9a00142542da1edacfd76c Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 11 Jun 2021 17:18:12 +0100 Subject: arm: Fix general issues with patterns for VLLDM and VLSTM Both lazy_store_multiple_insn and lazy_load_multiple_insn contain invalid RTL (eg they contain a post_inc statement outside of a mem). What's more, the instructions concerned do not modify their input address register. We probably got away with this because they are generated so late in the compilation that no subsequent pass needed to understand them. Nevertheless, this could cause problems someday, so fixed to use a simple legal unspec. gcc: * config/arm/vfp.md (lazy_store_multiple_insn): Rewrite as valid RTL. (lazy_load_multiple_insn): Likewise. --- gcc/config/arm/vfp.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'gcc') diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 93e96369..9961f93 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -1703,12 +1703,15 @@ (set_attr "type" "mov_reg")] ) +;; Both this and the next instruction are treated by GCC in the same +;; way as a blockage pattern. That's perhaps stronger than it needs +;; to be, but we do not want accesses to the VFP register bank to be +;; moved across either instruction. 
+ (define_insn "lazy_store_multiple_insn" - [(set (match_operand:SI 0 "s_register_operand" "+&rk") - (post_dec:SI (match_dup 0))) - (unspec_volatile [(const_int 0) - (mem:SI (post_dec:SI (match_dup 0)))] - VUNSPEC_VLSTM)] + [(unspec_volatile + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + VUNSPEC_VLSTM)] "use_cmse && reload_completed" "vlstm%?\\t%0" [(set_attr "predicable" "yes") @@ -1716,11 +1719,9 @@ ) (define_insn "lazy_load_multiple_insn" - [(set (match_operand:SI 0 "s_register_operand" "+&rk") - (post_inc:SI (match_dup 0))) - (unspec_volatile:SI [(const_int 0) - (mem:SI (match_dup 0))] - VUNSPEC_VLLDM)] + [(unspec_volatile + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + VUNSPEC_VLLDM)] "use_cmse && reload_completed" "vlldm%?\\t%0" [(set_attr "predicable" "yes") -- cgit v1.1 From 79fb2700bdbab4212346d907be6063c5a32d3836 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 18 Jun 2021 17:13:04 +0100 Subject: arm: testsuite: improve detection of CMSE hardware. The test for CMSE support being available in hardware currently relies on the compiler not optimizing away a secure gateway operation. But even that is suspect, because the SG instruction is just a NOP on armv8-m implementations that do not support the security extension. Replace the existing test with a new one that reads and checks the appropriate hardware feature register (memory mapped). This has to be run from secure mode, but that shouldn't matter, because if we can't do that we can't really test the CMSE extensions anyway. We retain the SG instruction to ensure the test can't pass accidentally if run on pre-armv8-m devices. gcc/testsuite: * lib/target-supports.exp (check_effective_target_arm_cmse_hw): Check the CMSE feature register, rather than relying on the SG operation causing an execution fault. 
--- gcc/testsuite/lib/target-supports.exp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 66ce48d..06f5b1e 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -4878,15 +4878,16 @@ proc check_effective_target_arm_cmse_ok {} { proc check_effective_target_arm_cmse_hw { } { return [check_runtime arm_cmse_hw_available { - int __attribute__ ((cmse_nonsecure_entry)) ns_func(void) - { - return 0; - } int main (void) { - return ns_func(); - } - } "-mcmse -Wl,--section-start,.gnu.sgstubs=0x00400000"] + unsigned id_pfr1; + asm ("ldr\t%0, =0xe000ed44\n" \ + "ldr\t%0, [%0]\n" \ + "sg" : "=l" (id_pfr1)); + /* Exit with code 0 iff security extension is available. */ + return !(id_pfr1 & 0xf0); + } + } "-mcmse"] } # Return 1 if the target supports executing MVE instructions, 0 # otherwise. -- cgit v1.1 From 3929bca9ca95de9d35e82ae8828b188029e3eb70 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 11 Jun 2021 16:02:05 +0100 Subject: arm: Add command-line option for enabling CVE-2021-35465 mitigation [PR102035] Add a new option, -mfix-cmse-cve-2021-35465 and document it. Enable it automatically for cortex-m33, cortex-m35p and cortex-m55. gcc: PR target/102035 * config/arm/arm.opt (mfix-cmse-cve-2021-35465): New option. * doc/invoke.texi (Arm Options): Document it. * config/arm/arm-cpus.in (quirk_vlldm): New feature bit. (ALL_QUIRKS): Add quirk_vlldm. (cortex-m33): Add quirk_vlldm. (cortex-m35p, cortex-m55): Likewise. * config/arm/arm.c (arm_option_override): Enable fix_vlldm if targetting an affected CPU and not explicitly controlled on the command line. 
--- gcc/config/arm/arm-cpus.in | 9 +++++++-- gcc/config/arm/arm.c | 9 +++++++++ gcc/config/arm/arm.opt | 4 ++++ gcc/doc/invoke.texi | 9 +++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) (limited to 'gcc') diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 249995a..bcc9ebe 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -186,6 +186,9 @@ define feature quirk_armv6kz # Cortex-M3 LDRD quirk. define feature quirk_cm3_ldrd +# v8-m/v8.1-m VLLDM errata. +define feature quirk_vlldm + # Don't use .cpu assembly directive define feature quirk_no_asmcpu @@ -322,7 +325,7 @@ define implied vfp_base MVE MVE_FP ALL_FP # architectures. # xscale isn't really a 'quirk', but it isn't an architecture either and we # need to ignore it for matching purposes. -define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd xscale quirk_no_asmcpu +define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd quirk_vlldm xscale quirk_no_asmcpu define fgroup IGNORE_FOR_MULTILIB cdecp0 cdecp1 cdecp2 cdecp3 cdecp4 cdecp5 cdecp6 cdecp7 @@ -1571,6 +1574,7 @@ begin cpu cortex-m33 architecture armv8-m.main+dsp+fp option nofp remove ALL_FP option nodsp remove armv7em + isa quirk_vlldm costs v7m end cpu cortex-m33 @@ -1580,6 +1584,7 @@ begin cpu cortex-m35p architecture armv8-m.main+dsp+fp option nofp remove ALL_FP option nodsp remove armv7em + isa quirk_vlldm costs v7m end cpu cortex-m35p @@ -1591,7 +1596,7 @@ begin cpu cortex-m55 option nomve remove mve mve_float option nofp remove ALL_FP mve_float option nodsp remove MVE mve_float - isa quirk_no_asmcpu + isa quirk_no_asmcpu quirk_vlldm costs v7m vendor 41 end cpu cortex-m55 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 11dafc7..5c92941 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3616,6 +3616,15 @@ arm_option_override (void) fix_cm3_ldrd = 0; } + /* Enable fix_vlldm by default if required. 
*/ + if (fix_vlldm == 2) + { + if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_vlldm)) + fix_vlldm = 1; + else + fix_vlldm = 0; + } + /* Hot/Cold partitioning is not currently supported, since we can't handle literal pool placement in that case. */ if (flag_reorder_blocks_and_partition) diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 7417b55..a7677ee 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -268,6 +268,10 @@ Target Var(fix_cm3_ldrd) Init(2) Avoid overlapping destination and address registers on LDRD instructions that may trigger Cortex-M3 errata. +mfix-cmse-cve-2021-35465 +Target Var(fix_vlldm) Init(2) +Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465). + munaligned-access Target Var(unaligned_access) Init(2) Save Enable unaligned word and halfword accesses to packed data. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index a9d56fe..b8f5d9e 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -808,6 +808,7 @@ Objective-C and Objective-C++ Dialects}. -mverbose-cost-dump @gol -mpure-code @gol -mcmse @gol +-mfix-cmse-cve-2021-35465 @gol -mfdpic} @emph{AVR Options} @@ -20743,6 +20744,14 @@ Generate secure code as per the "ARMv8-M Security Extensions: Requirements on Development Tools Engineering Specification", which can be found on @url{https://developer.arm.com/documentation/ecm0359818/latest/}. +@item -mfix-cmse-cve-2021-35465 +@opindex mfix-cmse-cve-2021-35465 +Mitigate against a potential security issue with the @code{VLLDM} instruction +in some M-profile devices when using CMSE (CVE-2021-365465). This option is +enabled by default when the option @option{-mcpu=} is used with +@code{cortex-m33}, @code{cortex-m35p} or @code{cortex-m55}. The option +@option{-mno-fix-cmse-cve-2021-35465} can be used to disable the mitigation. 
+ @item -mfdpic @itemx -mno-fdpic @opindex mfdpic -- cgit v1.1 From 30461cf8dba3d3adb15a125e4da48800eb2b9b8f Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Fri, 18 Jun 2021 17:18:37 +0100 Subject: arm: fix vlldm erratum for Armv8.1-m [PR102035] For Armv8.1-m we generate code that emits VLLDM directly and do not rely on support code in the library, so emit the mitigation directly as well, when required. In this case, we can use the compiler options to determine when to apply the fix and when it is safe to omit it. gcc: PR target/102035 * config/arm/arm.md (attribute arch): Add fix_vlldm. (arch_enabled): Use it. * config/arm/vfp.md (lazy_store_multiple_insn): Add alternative to use when erratum mitigation is needed. --- gcc/config/arm/arm.md | 11 +++++++++-- gcc/config/arm/vfp.md | 10 +++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'gcc') diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 0646048..5d3f21b 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -132,9 +132,12 @@ ; TARGET_32BIT, "t1" or "t2" to specify a specific Thumb mode. "v6" ; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without ; arm_arch6. "v6t2" for Thumb-2 with arm_arch6 and "v8mb" for ARMv8-M -; Baseline. This attribute is used to compute attribute "enabled", +; Baseline. "fix_vlldm" is for fixing the v8-m/v8.1-m VLLDM erratum. +; This attribute is used to compute attribute "enabled", ; use type "any" to enable an alternative in all cases. 
-(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,v8mb,iwmmxt,iwmmxt2,armv6_or_vfpv3,neon,mve" +(define_attr "arch" "any, a, t, 32, t1, t2, v6,nov6, v6t2, \ + v8mb, fix_vlldm, iwmmxt, iwmmxt2, armv6_or_vfpv3, \ + neon, mve" (const_string "any")) (define_attr "arch_enabled" "no,yes" @@ -177,6 +180,10 @@ (match_test "TARGET_THUMB1 && arm_arch8")) (const_string "yes") + (and (eq_attr "arch" "fix_vlldm") + (match_test "fix_vlldm")) + (const_string "yes") + (and (eq_attr "arch" "iwmmxt2") (match_test "TARGET_REALLY_IWMMXT2")) (const_string "yes") diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 9961f93..f0030a8 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -1720,11 +1720,15 @@ (define_insn "lazy_load_multiple_insn" [(unspec_volatile - [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))] + [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk,rk"))] VUNSPEC_VLLDM)] "use_cmse && reload_completed" - "vlldm%?\\t%0" - [(set_attr "predicable" "yes") + "@ + vscclrm\\t{vpr}\;vlldm\\t%0 + vlldm\\t%0" + [(set_attr "arch" "fix_vlldm,*") + (set_attr "predicable" "no") + (set_attr "length" "8,4") (set_attr "type" "load_4")] ) -- cgit v1.1 From 809330ab8450261e05919b472783bf15e4b000f7 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Tue, 6 Jul 2021 15:10:18 +0100 Subject: arm: Add tests for VLLDM mitigation [PR102035] New tests for the erratum mitigation. gcc/testsuite: PR target/102035 * gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c: New test. * gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c: Likewise. * gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c: Likewise. 
--- .../arm/cmse/mainline/8_1m/soft/cmse-13a.c | 31 ++++++++++++++++++++++ .../arm/cmse/mainline/8_1m/soft/cmse-7a.c | 28 +++++++++++++++++++ .../arm/cmse/mainline/8_1m/soft/cmse-8a.c | 30 +++++++++++++++++++++ .../arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c | 27 +++++++++++++++++++ .../arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c | 29 ++++++++++++++++++++ .../arm/cmse/mainline/8_1m/softfp/cmse-13a.c | 30 +++++++++++++++++++++ .../arm/cmse/mainline/8_1m/softfp/cmse-7a.c | 27 +++++++++++++++++++ .../arm/cmse/mainline/8_1m/softfp/cmse-8a.c | 29 ++++++++++++++++++++ 8 files changed, 231 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c new file mode 100644 index 0000000..553cc78 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */ + +#include "../../../cmse-13.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. 
*/ +/* { dg-final { scan-assembler "lsrs\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r1, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[1,4-9\]|r10|fp|ip), ){9}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[1,4-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c new file mode 100644 index 0000000..ce02fde --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */ + +#include "../../../cmse-7.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. 
*/ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c new file mode 100644 index 0000000..75e1611 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */ + +#include "../../../cmse-8.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. 
*/ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c new file mode 100644 index 0000000..dad7266 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16 -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ + +#include "../../../cmse-7.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */ +/* Check that no cleared register is used for blxns. 
*/ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c new file mode 100644 index 0000000..faa0448 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16 -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ + +#include "../../../cmse-8.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. 
*/ +/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c new file mode 100644 index 0000000..bceba44 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ + +#include "../../../cmse-13.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. 
*/ +/* { dg-final { scan-assembler "clrm\t\{(r1, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[1,4-9\]|r10|fp|ip), ){9}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[1,4-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c new file mode 100644 index 0000000..c74ebbd --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ + +#include "../../../cmse-7.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. */ +/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. 
*/ +/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c new file mode 100644 index 0000000..ffb67a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */ +/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ + +#include "../../../cmse-8.x" + +/* Checks for saving and clearing prior to function call. */ +/* Shift on the same register as blxns. 
*/ +/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ +/* { dg-final { scan-assembler "vlstm\tsp" } } */ +/* Check the right registers are cleared and none appears twice. */ +/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */ +/* Check that the right number of registers is cleared and thus only one + register is missing. */ +/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */ +/* Check that no cleared register is used for blxns. */ +/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */ +/* Check for v8.1-m variant of erratum work-around. */ +/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */ +/* { dg-final { scan-assembler "vlldm\tsp" } } */ +/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "blxns" } } */ -- cgit v1.1 From 9216ee6d1195d48388f825cf1b072e570129cbbe Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 24 Aug 2021 12:25:25 +0200 Subject: tree-optimization/100089 - avoid leaving scalar if-converted code around This avoids leaving scalar if-converted code around for the case of BB vectorizing an if-converted loop body when using the very-cheap cost model. In this case we scan not vectorized scalar stmts in the basic-block vectorized for COND_EXPRs and force the vectorization to be marked as not profitable. 
The patch also makes sure to always consider all BB vectorization subgraphs together for costing purposes when vectorizing an if-converted loop body. 2021-08-24 Richard Biener PR tree-optimization/100089 * tree-vectorizer.h (vect_slp_bb): Rename to ... (vect_slp_if_converted_bb): ... this and get the original loop as new argument. * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix, pass original loop to vect_slp_if_converted_bb. * tree-vect-slp.c (vect_bb_vectorization_profitable_p): If orig_loop was passed scan the not vectorized stmts for COND_EXPRs and force not profitable if found. (vect_slp_region): Pass down all SLP instances to costing if orig_loop was specified. (vect_slp_bbs): Pass through orig_loop. (vect_slp_bb): Rename to ... (vect_slp_if_converted_bb): ... this and get the original loop as new argument. (vect_slp_function): Adjust. --- gcc/tree-vect-slp.c | 70 ++++++++++++++++++++++++++++++++++++++++----------- gcc/tree-vectorizer.c | 20 ++++++++------- gcc/tree-vectorizer.h | 2 +- 3 files changed, 68 insertions(+), 24 deletions(-) (limited to 'gcc') diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index d2f6a16..edc11c6 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -5233,7 +5233,8 @@ li_cost_vec_cmp (const void *a_, const void *b_) static bool vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, - vec slp_instances) + vec slp_instances, + loop_p orig_loop) { slp_instance instance; int i; @@ -5270,6 +5271,30 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, vector_costs.safe_splice (instance->cost_vec); instance->cost_vec.release (); } + /* When we're vectorizing an if-converted loop body with the + very-cheap cost model make sure we vectorized all if-converted + code. 
*/ + bool force_not_profitable = false; + if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP) + { + gcc_assert (bb_vinfo->bbs.length () == 1); + for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + /* The costing above left us with DCEable vectorized scalar + stmts having the visited flag set. */ + if (gimple_visited_p (gsi_stmt (gsi))) + continue; + + if (gassign *ass = dyn_cast (gsi_stmt (gsi))) + if (gimple_assign_rhs_code (ass) == COND_EXPR) + { + force_not_profitable = true; + break; + } + } + } + /* Unset visited flag. */ stmt_info_for_cost *cost; FOR_EACH_VEC_ELT (scalar_costs, i, cost) @@ -5394,9 +5419,14 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, return false; } + if (dump_enabled_p () && force_not_profitable) + dump_printf_loc (MSG_NOTE, vect_location, + "not profitable because of unprofitable if-converted " + "scalar code\n"); + scalar_costs.release (); vector_costs.release (); - return true; + return !force_not_profitable; } /* qsort comparator for lane defs. */ @@ -5810,7 +5840,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, static bool vect_slp_region (vec bbs, vec datarefs, - vec *dataref_groups, unsigned int n_stmts) + vec *dataref_groups, unsigned int n_stmts, + loop_p orig_loop) { bb_vec_info bb_vinfo; auto_vector_modes vector_modes; @@ -5859,7 +5890,9 @@ vect_slp_region (vec bbs, vec datarefs, vect_location = instance->location (); if (!unlimited_cost_model (NULL) && !vect_bb_vectorization_profitable_p - (bb_vinfo, instance->subgraph_entries)) + (bb_vinfo, + orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo) + : instance->subgraph_entries, orig_loop)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -5877,7 +5910,9 @@ vect_slp_region (vec bbs, vec datarefs, "using SLP\n"); vectorized = true; - vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + vect_schedule_slp (bb_vinfo, + orig_loop ? 
BB_VINFO_SLP_INSTANCES (bb_vinfo) + : instance->subgraph_entries); unsigned HOST_WIDE_INT bytes; if (dump_enabled_p ()) @@ -5892,6 +5927,11 @@ vect_slp_region (vec bbs, vec datarefs, "basic block part vectorized using " "variable length vectors\n"); } + + /* When we're called from loop vectorization we're considering + all subgraphs at once. */ + if (orig_loop) + break; } } else @@ -5959,7 +5999,7 @@ vect_slp_region (vec bbs, vec datarefs, true if anything in the basic-block was vectorized. */ static bool -vect_slp_bbs (const vec &bbs) +vect_slp_bbs (const vec &bbs, loop_p orig_loop) { vec datarefs = vNULL; auto_vec dataref_groups; @@ -5989,18 +6029,20 @@ vect_slp_bbs (const vec &bbs) ++current_group; } - return vect_slp_region (bbs, datarefs, &dataref_groups, insns); + return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop); } -/* Main entry for the BB vectorizer. Analyze and transform BB, returns - true if anything in the basic-block was vectorized. */ +/* Special entry for the BB vectorizer. Analyze and transform a single + if-converted BB with ORIG_LOOPs body being the not if-converted + representation. Returns true if anything in the basic-block was + vectorized. */ bool -vect_slp_bb (basic_block bb) +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) { auto_vec bbs; bbs.safe_push (bb); - return vect_slp_bbs (bbs); + return vect_slp_bbs (bbs, orig_loop); } /* Main entry for the BB vectorizer. 
Analyze and transform BB, returns @@ -6051,7 +6093,7 @@ vect_slp_function (function *fun) if (split && !bbs.is_empty ()) { - r |= vect_slp_bbs (bbs); + r |= vect_slp_bbs (bbs, NULL); bbs.truncate (0); bbs.quick_push (bb); } @@ -6069,13 +6111,13 @@ vect_slp_function (function *fun) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "splitting region at control altering " "definition %G", last); - r |= vect_slp_bbs (bbs); + r |= vect_slp_bbs (bbs, NULL); bbs.truncate (0); } } if (!bbs.is_empty ()) - r |= vect_slp_bbs (bbs); + r |= vect_slp_bbs (bbs, NULL); free (rpo); diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 813f468..3aa3e2a 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1033,10 +1033,7 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, only non-if-converted parts took part in BB vectorization. */ if (flag_tree_slp_vectorize != 0 && loop_vectorized_call - && ! loop->inner - /* This would purely be a workaround and should be removed - once PR100089 is fixed. */ - && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP) + && ! 
loop->inner) { basic_block bb = loop->header; bool require_loop_vectorize = false; @@ -1062,12 +1059,17 @@ try_vectorize_loop_1 (hash_table *&simduid_to_vf_htab, gimple_set_uid (stmt, -1); gimple_set_visited (stmt, false); } - if (!require_loop_vectorize && vect_slp_bb (bb)) + if (!require_loop_vectorize) { - fold_loop_internal_call (loop_vectorized_call, - boolean_true_node); - loop_vectorized_call = NULL; - ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; + tree arg = gimple_call_arg (loop_vectorized_call, 1); + class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg)); + if (vect_slp_if_converted_bb (bb, scalar_loop)) + { + fold_loop_internal_call (loop_vectorized_call, + boolean_true_node); + loop_vectorized_call = NULL; + ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; + } } } /* If outer loop vectorization fails for LOOP_VECTORIZED guarded diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 9c2c29d..72e018e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *); extern void vect_get_slp_defs (slp_tree, vec *); extern void vect_get_slp_defs (vec_info *, slp_tree, vec > *, unsigned n = -1U); -extern bool vect_slp_bb (basic_block); +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); extern bool vect_slp_function (function *); extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree); -- cgit v1.1 From 6e5401e87d02919b0594e04f828892deef956407 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 23 Aug 2021 14:47:03 -0700 Subject: x86: Broadcast from integer to a pseudo vector register Broadcast from integer to a pseudo vector register instead of a hard vector register to allow LRA to remove redundant move instruction after broadcast. 
gcc/ PR target/102021 * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast from integer to a pseudo vector register. gcc/testsuite/ PR target/102021 * gcc.target/i386/pr100865-10b.c: Expect vzeroupper. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr102021.c: New test. --- gcc/config/i386/i386-expand.c | 13 ++----------- gcc/testsuite/gcc.target/i386/pr100865-10b.c | 1 - gcc/testsuite/gcc.target/i386/pr100865-4b.c | 3 +-- gcc/testsuite/gcc.target/i386/pr100865-6b.c | 6 ++---- gcc/testsuite/gcc.target/i386/pr100865-7b.c | 6 ++---- gcc/testsuite/gcc.target/i386/pr102021.c | 15 +++++++++++++++ 6 files changed, 22 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c (limited to 'gcc') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9bf13db..2500dbf 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -579,19 +579,10 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) { /* Broadcast to XMM/YMM/ZMM register from an integer constant or scalar mem. */ - /* Hard registers are used for 2 purposes: - 1. Prevent stack realignment when the original code - doesn't use vector registers, which is the same for - memcpy and memset. - 2. Prevent combine to convert constant broadcast to - load from constant pool. 
*/ - op1 = ix86_gen_scratch_sse_rtx (mode); + op1 = gen_reg_rtx (mode); if (FLOAT_MODE_P (mode) || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) - { - first = force_const_mem (GET_MODE_INNER (mode), first); - op1 = gen_reg_rtx (mode); - } + first = force_const_mem (GET_MODE_INNER (mode), first); bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c index 77ace86..e5616d8 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c @@ -5,4 +5,3 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */ -/* { dg-final { scan-assembler-not "vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index 80e9fdb..6d9cb91 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -5,7 +5,6 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c index 35f2e96..9588249 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c @@ -4,9 +4,7 @@ #include "pr100865-6a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c index ad267c4..3b20c68 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c @@ -5,8 +5,6 @@ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c new file mode 100644 index 0000000..6db3f57 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102021.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=skylake-avx512" } */ + +#include + +__m256i +foo () +{ + return _mm256_set1_epi16 (12); +} + +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-not "vzeroupper" } } */ -- cgit v1.1 From 675a3e40567e1d0dd6d7e7be3efab74b22731415 Mon Sep 17 00:00:00 2001 From: Andrew MacLeod Date: Wed, 18 Aug 2021 16:36:19 -0400 Subject: Add transitive operations to the relation oracle. When registering relations in the oracle, search for other relations which imply new transitive relations. gcc/ * value-relation.cc (rr_transitive_table): New. (relation_transitive): New. (value_relation::swap): Remove. (value_relation::apply_transitive): New. (relation_oracle::relation_oracle): Allocate a new tmp bitmap. (relation_oracle::register_relation): Call register_transitives. (relation_oracle::register_transitives): New. * value-relation.h (relation_oracle): Add new temporary bitmap and methods. gcc/testsuite/ * gcc.dg/predict-1.c: Disable evrp. * gcc.dg/tree-ssa/evrp-trans.c: New. 
--- gcc/testsuite/gcc.dg/predict-1.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c | 144 +++++++++++++++ gcc/value-relation.cc | 287 ++++++++++++++++++++++++++--- gcc/value-relation.h | 9 +- 4 files changed, 410 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/predict-1.c b/gcc/testsuite/gcc.dg/predict-1.c index 9e5605a..d2e753e 100644 --- a/gcc/testsuite/gcc.dg/predict-1.c +++ b/gcc/testsuite/gcc.dg/predict-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-profile_estimate" } */ +/* { dg-options "-O2 -fdump-tree-profile_estimate --disable-tree-evrp" } */ extern int global; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c b/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c new file mode 100644 index 0000000..8ee8e3c --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c @@ -0,0 +1,144 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-evrp" } */ + +/* Simple tests to make sure transitives are working. */ +void keep(); +void kill(); + +void +f1 (int x, int y, int z) +{ + if (x > y) + if (y > z) + { + if (x > z) + keep (); + else + kill (); + } +} + +void +f2 (int w, int x, int y, int z) +{ + // Test one equivalence. + if (w == z) + if (x > y) + if (y > z) + { + if (x > w) + keep (); + else + kill (); + } +} + +void +f3 (int a, int w, int x, int y, int z) +{ + // Test two equivlaences. 
+ if (a == x) + if (w == z) + if (x > y) + if (y > z) + { + if (a > w) + keep (); + else + kill (); + } +} + +void +f4 (int x, int y, int z) +{ + // test X > Y >= Z + if (x > y) + if (y >= z) + { + if (x > z) + keep (); + else + kill (); + } +} +void +f5 (int x, int y, int z) +{ + // test X >= Y > Z + if (x >= y) + if (y > z) + { + if (x > z) + keep (); + else + kill (); + } +} + +void +f6 (int x, int y, int z) +{ + // test X >= Y >= Z + if (x >= y) + if (y >= z) + { + if (x > z) + keep (); + else if (x == z) + keep (); + else + kill (); + } +} + +void +f7 (int x, int y, int z) +{ + // test Y <= X , Z <= Y + if (y <= x) + if (z <= y) + { + if (x > z) + keep (); + else if (x == z) + keep (); + else + kill (); + } +} + +void +f8 (int x, int y, int z) +{ + // test X >= Y, Z <= Y + if (x >= y) + if (z <= y) + { + if (x > z) + keep (); + else if (x == z) + keep (); + else + kill (); + } +} + +void +f9 (int x, int y, int z) +{ + // test Y <= X Y >= Z + if (y <= x) + if (y >= z) + { + if (x > z) + keep (); + else if (x == z) + keep (); + else + kill (); + } +} + +/* { dg-final { scan-tree-dump-not "kill" "evrp" } } */ +/* { dg-final { scan-tree-dump-times "keep" 13 "evrp"} } */ diff --git a/gcc/value-relation.cc b/gcc/value-relation.cc index bcfe388..8edd98b 100644 --- a/gcc/value-relation.cc +++ b/gcc/value-relation.cc @@ -112,7 +112,7 @@ relation_kind rr_intersect_table[VREL_COUNT][VREL_COUNT] = { { NE_EXPR, LT_EXPR, LT_EXPR, GT_EXPR, GT_EXPR, VREL_EMPTY, VREL_EMPTY, NE_EXPR } }; -// Intersect relation R! with relation R2 and return the resulting relation. +// Intersect relation R1 with relation R2 and return the resulting relation. relation_kind relation_intersect (relation_kind r1, relation_kind r2) @@ -155,6 +155,39 @@ relation_union (relation_kind r1, relation_kind r2) } +// This table is used to determine transitivity between 2 relations. 
+// (A relation0 B) and (B relation1 C) implies (A result C) + +relation_kind rr_transitive_table[VREL_COUNT][VREL_COUNT] = { +// NONE, LT_EXPR, LE_EXPR, GT_EXPR, GE_EXPR, EMPTY, EQ_EXPR, NE_EXPR +// VREL_NONE + { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE }, +// LT_EXPR + { VREL_NONE, LT_EXPR, LT_EXPR, VREL_NONE, VREL_NONE, VREL_NONE, LT_EXPR, VREL_NONE }, +// LE_EXPR + { VREL_NONE, LT_EXPR, LE_EXPR, VREL_NONE, VREL_NONE, VREL_NONE, LE_EXPR, VREL_NONE }, +// GT_EXPR + { VREL_NONE, VREL_NONE, VREL_NONE, GT_EXPR, GT_EXPR, VREL_NONE, GT_EXPR, VREL_NONE }, +// GE_EXPR + { VREL_NONE, VREL_NONE, VREL_NONE, GT_EXPR, GE_EXPR, VREL_NONE, GE_EXPR, VREL_NONE }, +// VREL_EMPTY + { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE }, +// EQ_EXPR + { VREL_NONE, LT_EXPR, LE_EXPR, GT_EXPR, GE_EXPR, VREL_NONE, EQ_EXPR, VREL_NONE }, +// NE_EXPR + { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE } }; + +// Apply transitive operation between relation R1 and relation R2, and +// return the resulting relation, if any. + +relation_kind +relation_transitive (relation_kind r1, relation_kind r2) +{ + vrel_range_assert (r1); + vrel_range_assert (r2); + return rr_transitive_table[r1 - VREL_FIRST][r2 - VREL_FIRST]; +} + // ------------------------------------------------------------------------- // This class represents an equivalency set, and contains a link to the next @@ -472,7 +505,7 @@ public: bool union_ (value_relation &p); bool intersect (value_relation &p); void negate (); - void swap (); + bool apply_transitive (const value_relation &rel); void dump (FILE *f) const; private: @@ -517,14 +550,6 @@ value_relation::negate () related = relation_negate (related); } -// Modify the relation as if the operands were being swapped. - -void -value_relation::swap () -{ - related = relation_swap (related); -} - // Perform an intersection between 2 relations. *this &&= p. 
bool @@ -561,6 +586,73 @@ value_relation::union_ (value_relation &p) return old != related; } +// Identify and apply any transitive relations between REL +// and THIS. Return true if there was a transformation. + +bool +value_relation::apply_transitive (const value_relation &rel) +{ + relation_kind k = VREL_NONE; + + // Identify any common operand, and normalize the relations to + // the form : A < B B < C produces A < C + if (rel.op1 () == name2) + { + // A < B B < C + if (rel.op2 () == name1) + return false; + k = relation_transitive (kind (), rel.kind ()); + if (k != VREL_NONE) + { + related = k; + name2 = rel.op2 (); + return true; + } + } + else if (rel.op1 () == name1) + { + // B > A B < C + if (rel.op2 () == name2) + return false; + k = relation_transitive (relation_swap (kind ()), rel.kind ()); + if (k != VREL_NONE) + { + related = k; + name1 = name2; + name2 = rel.op2 (); + return true; + } + } + else if (rel.op2 () == name2) + { + // A < B C > B + if (rel.op1 () == name1) + return false; + k = relation_transitive (kind (), relation_swap (rel.kind ())); + if (k != VREL_NONE) + { + related = k; + name2 = rel.op1 (); + return true; + } + } + else if (rel.op2 () == name1) + { + // B > A C > B + if (rel.op1 () == name2) + return false; + k = relation_transitive (relation_swap (kind ()), + relation_swap (rel.kind ())); + if (k != VREL_NONE) + { + related = k; + name1 = name2; + name2 = rel.op1 (); + return true; + } + } + return false; +} // Dump the relation to file F. @@ -597,6 +689,7 @@ relation_oracle::relation_oracle () m_relations.safe_grow_cleared (last_basic_block_for_fn (cfun) + 1); m_relation_set = BITMAP_ALLOC (&m_bitmaps); m_tmp = BITMAP_ALLOC (&m_bitmaps); + m_tmp2 = BITMAP_ALLOC (&m_bitmaps); } // Destruct a relation oracle. @@ -669,10 +762,12 @@ relation_oracle::register_relation (edge e, relation_kind k, tree op1, // Register relation K between OP! and OP2 in block BB. 
// This creates the record and searches for existing records in the dominator // tree to merge with. +// TRANSITIVE_P is true if this is being registered as a transitive operation, +// and should not try to register further transitives. void relation_oracle::register_relation (basic_block bb, relation_kind k, tree op1, - tree op2) + tree op2, bool transitive_p) { gcc_checking_assert (k != VREL_NONE); @@ -710,26 +805,160 @@ relation_oracle::register_relation (basic_block bb, relation_kind k, tree op1, ptr->dump (dump_file); fprintf (dump_file, "\n"); } - return; + } + else + { + // Check for an existing relation further up the DOM chain. + // By including dominating relations, The first one found in any search + // will be the aggregate of all the previous ones. + curr = find_relation_dom (bb, v1, v2); + if (curr != VREL_NONE) + k = relation_intersect (curr, k); + + bitmap_set_bit (bm, v1); + bitmap_set_bit (bm, v2); + bitmap_set_bit (m_relation_set, v1); + bitmap_set_bit (m_relation_set, v2); + + ptr = (relation_chain *) obstack_alloc (&m_chain_obstack, + sizeof (relation_chain)); + ptr->set_relation (k, op1, op2); + ptr->m_next = m_relations[bbi].m_head; + m_relations[bbi].m_head = ptr;; } - // Check for an existing relation further up the DOM chain. - // By including dominating relations, The first one found in any search - // will be the aggregate of all the previous ones. 
- curr = find_relation_dom (bb, v1, v2); - if (curr != VREL_NONE) - k = relation_intersect (curr, k); - - bitmap_set_bit (bm, v1); - bitmap_set_bit (bm, v2); - bitmap_set_bit (m_relation_set, v1); - bitmap_set_bit (m_relation_set, v2); - - ptr = (relation_chain *) obstack_alloc (&m_chain_obstack, - sizeof (relation_chain)); - ptr->set_relation (k, op1, op2); - ptr->m_next = m_relations[bbi].m_head; - m_relations[bbi].m_head = ptr;; + if (!transitive_p) + register_transitives (bb, *ptr); +} + +// Starting at ROOT_BB search the DOM tree looking for relations which +// may produce transitive relations to RELATION. EQUIV1 and EQUIV2 are +// bitmaps for op1/op2 and any of their equivalences that should also be +// considered. + +void +relation_oracle::register_transitives (basic_block root_bb, + const value_relation &relation, + const_bitmap equiv1, + const_bitmap equiv2) +{ + basic_block bb; + for (bb = root_bb; bb; bb = get_immediate_dominator (CDI_DOMINATORS, bb)) + { + int bbi = bb->index; + if (bbi >= (int)m_relations.length()) + continue; + const_bitmap bm = m_relations[bbi].m_names; + if (!bm) + continue; + if (!bitmap_intersect_p (bm, equiv1) && !bitmap_intersect_p (bm, equiv2)) + continue; + // At least one of the 2 ops has a relation in this block. + relation_chain *ptr; + for (ptr = m_relations[bbi].m_head; ptr ; ptr = ptr->m_next) + { + // In the presence of an equivalence, 2 operands may not + // naturally match. ie with equivalence a_2 == b_3 + // given c_1 < a_2 && b_3 < d_4 + // convert the second relation (b_3 < d_4) to match any + // equivalences found in the first relation. + // ie convert b_3 < d_4 to a_2 < d_4, which then exposes the + // transitive operation: c_1 < a_2 && a_2 < d_4 -> c_1 < d_4 + + tree r1, r2; + tree p1 = ptr->op1 (); + tree p2 = ptr->op2 (); + // Find which equivalence is in the first operand. 
+ if (bitmap_bit_p (equiv1, SSA_NAME_VERSION (p1))) + r1 = p1; + else if (bitmap_bit_p (equiv1, SSA_NAME_VERSION (p2))) + r1 = p2; + else + r1 = NULL_TREE; + + // Find which equivalence is in the second operand. + if (bitmap_bit_p (equiv2, SSA_NAME_VERSION (p1))) + r2 = p1; + else if (bitmap_bit_p (equiv2, SSA_NAME_VERSION (p2))) + r2 = p2; + else + r2 = NULL_TREE; + + // Ignore if both NULL (not relevant relation) or the same, + if (r1 == r2) + continue; + + // Any operand not an equivalence, just take the real operand. + if (!r1) + r1 = relation.op1 (); + if (!r2) + r2 = relation.op2 (); + + value_relation nr (relation.kind (), r1, r2); + if (nr.apply_transitive (*ptr)) + { + register_relation (root_bb, nr.kind (), nr.op1 (), nr.op2 (), + true); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, " Registering transitive relation "); + nr.dump (dump_file); + fputc ('\n', dump_file); + } + } + + } + } +} + +// Find and register any transitive relations implied by RELATION occurring +// in block BB. + +void +relation_oracle::register_transitives (basic_block bb, + const value_relation &relation) +{ + // Only apply transitives to certain kinds of operations. + switch (relation.kind ()) + { + case LE_EXPR: + case LT_EXPR: + case GT_EXPR: + case GE_EXPR: + break; + default: + return; + } + + // Set up the bitmaps for op1 and op2, and if there are no equivalencies, + // set just op1 or op2 in their own bitmap. 
+ const_bitmap equiv1 = equiv_set (relation.op1 (), bb); + const_bitmap equiv2 = equiv_set (relation.op2 (), bb); + if (equiv1) + { + if (equiv2) + register_transitives (bb, relation, equiv1, equiv2); + else + { + bitmap_clear (m_tmp); + bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op2 ())); + register_transitives (bb, relation, equiv1, m_tmp); + } + } + else if (equiv2) + { + bitmap_clear (m_tmp); + bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op1 ())); + register_transitives (bb, relation, m_tmp, equiv2); + } + else + { + bitmap_clear (m_tmp); + bitmap_clear (m_tmp2); + bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op1 ())); + bitmap_set_bit (m_tmp2, SSA_NAME_VERSION (relation.op2 ())); + register_transitives (bb, relation, m_tmp, m_tmp2); + } } // Find the relation between any ssa_name in B1 and any name in B2 in block BB. diff --git a/gcc/value-relation.h b/gcc/value-relation.h index 1148854..e0e2f82 100644 --- a/gcc/value-relation.h +++ b/gcc/value-relation.h @@ -143,7 +143,7 @@ public: void dump (FILE *f, basic_block bb) const; void dump (FILE *f) const; private: - bitmap m_tmp; + bitmap m_tmp, m_tmp2; bitmap m_relation_set; // Index by ssa-name. True if a relation exists vec m_relations; // Index by BB, list of relations. 
relation_kind find_relation_block (unsigned bb, const_bitmap b1, @@ -153,7 +153,12 @@ private: relation_kind find_relation_block (int bb, unsigned v1, unsigned v2, relation_chain **obj = NULL); relation_kind find_relation_dom (basic_block bb, unsigned v1, unsigned v2); - void register_relation (basic_block bb, relation_kind k, tree op1, tree op2); + void register_relation (basic_block bb, relation_kind k, tree op1, tree op2, + bool transitive_p = false); + void register_transitives (basic_block, const class value_relation &); + void register_transitives (basic_block, const value_relation &, const_bitmap, + const_bitmap); + }; #endif /* GCC_VALUE_RELATION_H */ -- cgit v1.1 From fce8a52d0aef5f0ef393f68d31669058e0ddfd71 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Fri, 2 Apr 2021 16:48:36 -0500 Subject: rs6000: Add power7 and power7-64 builtins 2021-04-02 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power7 and power7-64 stanzas. --- gcc/config/rs6000/rs6000-builtin-new.def | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'gcc') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index 61f5b94..a310bf4 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -1961,3 +1961,42 @@ const vsll __builtin_vsx_xxspltd_2di (vsll, const int<1>); XXSPLTD_V2DI vsx_xxspltd_v2di {} + + +; Power7 builtins (ISA 2.06). 
+[power7] + const unsigned int __builtin_addg6s (unsigned int, unsigned int); + ADDG6S addg6s {} + + const signed long __builtin_bpermd (signed long, signed long); + BPERMD bpermd_di {} + + const unsigned int __builtin_cbcdtd (unsigned int); + CBCDTD cbcdtd {} + + const unsigned int __builtin_cdtbcd (unsigned int); + CDTBCD cdtbcd {} + + const signed int __builtin_divwe (signed int, signed int); + DIVWE dive_si {} + + const unsigned int __builtin_divweu (unsigned int, unsigned int); + DIVWEU diveu_si {} + + const vsq __builtin_pack_vector_int128 (unsigned long long, unsigned long long); + PACK_V1TI packv1ti {} + + void __builtin_ppc_speculation_barrier (); + SPECBARR speculation_barrier {} + + const unsigned long __builtin_unpack_vector_int128 (vsq, const int<1>); + UNPACK_V1TI unpackv1ti {} + + +; Power7 builtins requiring 64-bit GPRs (even with 32-bit addressing). +[power7-64] + const signed long long __builtin_divde (signed long long, signed long long); + DIVDE dive_di {} + + const unsigned long long __builtin_divdeu (unsigned long long, unsigned long long); + DIVDEU diveu_di {} -- cgit v1.1 From 8ce18a29ef717f5920ebf5dc1d9e84570a1827d4 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Thu, 1 Apr 2021 13:57:44 -0500 Subject: rs6000: Add power8-vector builtins 2021-04-01 Bill Schmidt gcc/ * config/rs6000/rs6000-builtin-new.def: Add power8-vector stanza. --- gcc/config/rs6000/rs6000-builtin-new.def | 438 +++++++++++++++++++++++++++++++ 1 file changed, 438 insertions(+) (limited to 'gcc') diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def index a310bf4..2a2c913 100644 --- a/gcc/config/rs6000/rs6000-builtin-new.def +++ b/gcc/config/rs6000/rs6000-builtin-new.def @@ -2000,3 +2000,441 @@ const unsigned long long __builtin_divdeu (unsigned long long, unsigned long long); DIVDEU diveu_di {} + + +; Power8 vector built-ins. 
+[power8-vector] + const vsll __builtin_altivec_abs_v2di (vsll); + ABS_V2DI absv2di2 {} + + const vsc __builtin_altivec_bcddiv10_v16qi (vsc); + BCDDIV10_V16QI bcddiv10_v16qi {} + + const vsc __builtin_altivec_bcdmul10_v16qi (vsc); + BCDMUL10_V16QI bcdmul10_v16qi {} + + const vsc __builtin_altivec_eqv_v16qi (vsc, vsc); + EQV_V16QI eqvv16qi3 {} + + const vuc __builtin_altivec_eqv_v16qi_uns (vuc, vuc); + EQV_V16QI_UNS eqvv16qi3 {} + + const vsq __builtin_altivec_eqv_v1ti (vsq, vsq); + EQV_V1TI eqvv1ti3 {} + + const vuq __builtin_altivec_eqv_v1ti_uns (vuq, vuq); + EQV_V1TI_UNS eqvv1ti3 {} + + const vd __builtin_altivec_eqv_v2df (vd, vd); + EQV_V2DF eqvv2df3 {} + + const vsll __builtin_altivec_eqv_v2di (vsll, vsll); + EQV_V2DI eqvv2di3 {} + + const vull __builtin_altivec_eqv_v2di_uns (vull, vull); + EQV_V2DI_UNS eqvv2di3 {} + + const vf __builtin_altivec_eqv_v4sf (vf, vf); + EQV_V4SF eqvv4sf3 {} + + const vsi __builtin_altivec_eqv_v4si (vsi, vsi); + EQV_V4SI eqvv4si3 {} + + const vui __builtin_altivec_eqv_v4si_uns (vui, vui); + EQV_V4SI_UNS eqvv4si3 {} + + const vss __builtin_altivec_eqv_v8hi (vss, vss); + EQV_V8HI eqvv8hi3 {} + + const vus __builtin_altivec_eqv_v8hi_uns (vus, vus); + EQV_V8HI_UNS eqvv8hi3 {} + + const vsc __builtin_altivec_nand_v16qi (vsc, vsc); + NAND_V16QI nandv16qi3 {} + + const vuc __builtin_altivec_nand_v16qi_uns (vuc, vuc); + NAND_V16QI_UNS nandv16qi3 {} + + const vsq __builtin_altivec_nand_v1ti (vsq, vsq); + NAND_V1TI nandv1ti3 {} + + const vuq __builtin_altivec_nand_v1ti_uns (vuq, vuq); + NAND_V1TI_UNS nandv1ti3 {} + + const vd __builtin_altivec_nand_v2df (vd, vd); + NAND_V2DF nandv2df3 {} + + const vsll __builtin_altivec_nand_v2di (vsll, vsll); + NAND_V2DI nandv2di3 {} + + const vull __builtin_altivec_nand_v2di_uns (vull, vull); + NAND_V2DI_UNS nandv2di3 {} + + const vf __builtin_altivec_nand_v4sf (vf, vf); + NAND_V4SF nandv4sf3 {} + + const vsi __builtin_altivec_nand_v4si (vsi, vsi); + NAND_V4SI nandv4si3 {} + + const vui 
__builtin_altivec_nand_v4si_uns (vui, vui); + NAND_V4SI_UNS nandv4si3 {} + + const vss __builtin_altivec_nand_v8hi (vss, vss); + NAND_V8HI nandv8hi3 {} + + const vus __builtin_altivec_nand_v8hi_uns (vus, vus); + NAND_V8HI_UNS nandv8hi3 {} + + const vsc __builtin_altivec_neg_v16qi (vsc); + NEG_V16QI negv16qi2 {} + + const vd __builtin_altivec_neg_v2df (vd); + NEG_V2DF negv2df2 {} + + const vsll __builtin_altivec_neg_v2di (vsll); + NEG_V2DI negv2di2 {} + + const vf __builtin_altivec_neg_v4sf (vf); + NEG_V4SF negv4sf2 {} + + const vsi __builtin_altivec_neg_v4si (vsi); + NEG_V4SI negv4si2 {} + + const vss __builtin_altivec_neg_v8hi (vss); + NEG_V8HI negv8hi2 {} + + const vsc __builtin_altivec_orc_v16qi (vsc, vsc); + ORC_V16QI orcv16qi3 {} + + const vuc __builtin_altivec_orc_v16qi_uns (vuc, vuc); + ORC_V16QI_UNS orcv16qi3 {} + + const vsq __builtin_altivec_orc_v1ti (vsq, vsq); + ORC_V1TI orcv1ti3 {} + + const vuq __builtin_altivec_orc_v1ti_uns (vuq, vuq); + ORC_V1TI_UNS orcv1ti3 {} + + const vd __builtin_altivec_orc_v2df (vd, vd); + ORC_V2DF orcv2df3 {} + + const vsll __builtin_altivec_orc_v2di (vsll, vsll); + ORC_V2DI orcv2di3 {} + + const vull __builtin_altivec_orc_v2di_uns (vull, vull); + ORC_V2DI_UNS orcv2di3 {} + + const vf __builtin_altivec_orc_v4sf (vf, vf); + ORC_V4SF orcv4sf3 {} + + const vsi __builtin_altivec_orc_v4si (vsi, vsi); + ORC_V4SI orcv4si3 {} + + const vui __builtin_altivec_orc_v4si_uns (vui, vui); + ORC_V4SI_UNS orcv4si3 {} + + const vss __builtin_altivec_orc_v8hi (vss, vss); + ORC_V8HI orcv8hi3 {} + + const vus __builtin_altivec_orc_v8hi_uns (vus, vus); + ORC_V8HI_UNS orcv8hi3 {} + + const vsc __builtin_altivec_vclzb (vsc); + VCLZB clzv16qi2 {} + + const vsll __builtin_altivec_vclzd (vsll); + VCLZD clzv2di2 {} + + const vss __builtin_altivec_vclzh (vss); + VCLZH clzv8hi2 {} + + const vsi __builtin_altivec_vclzw (vsi); + VCLZW clzv4si2 {} + + const vuc __builtin_altivec_vgbbd (vuc); + VGBBD p8v_vgbbd {} + + const vsq __builtin_altivec_vaddcuq (vsq, 
vsq); + VADDCUQ altivec_vaddcuq {} + + const vsq __builtin_altivec_vaddecuq (vsq, vsq, vsq); + VADDECUQ altivec_vaddecuq {} + + const vsq __builtin_altivec_vaddeuqm (vsq, vsq, vsq); + VADDEUQM altivec_vaddeuqm {} + + const vsll __builtin_altivec_vaddudm (vsll, vsll); + VADDUDM addv2di3 {} + + const vsq __builtin_altivec_vadduqm (vsq, vsq); + VADDUQM altivec_vadduqm {} + + const vsll __builtin_altivec_vbpermq (vsc, vsc); + VBPERMQ altivec_vbpermq {} + + const vsc __builtin_altivec_vbpermq2 (vsc, vsc); + VBPERMQ2 altivec_vbpermq2 {} + + const vsll __builtin_altivec_vmaxsd (vsll, vsll); + VMAXSD smaxv2di3 {} + + const vull __builtin_altivec_vmaxud (vull, vull); + VMAXUD umaxv2di3 {} + + const vsll __builtin_altivec_vminsd (vsll, vsll); + VMINSD sminv2di3 {} + + const vull __builtin_altivec_vminud (vull, vull); + VMINUD uminv2di3 {} + + const vd __builtin_altivec_vmrgew_v2df (vd, vd); + VMRGEW_V2DF p8_vmrgew_v2df {} + + const vsll __builtin_altivec_vmrgew_v2di (vsll, vsll); + VMRGEW_V2DI p8_vmrgew_v2di {} + + const vf __builtin_altivec_vmrgew_v4sf (vf, vf); + VMRGEW_V4SF p8_vmrgew_v4sf {} + + const vsi __builtin_altivec_vmrgew_v4si (vsi, vsi); + VMRGEW_V4SI p8_vmrgew_v4si {} + + const vd __builtin_altivec_vmrgow_v2df (vd, vd); + VMRGOW_V2DF p8_vmrgow_v2df {} + + const vsll __builtin_altivec_vmrgow_v2di (vsll, vsll); + VMRGOW_V2DI p8_vmrgow_v2di {} + + const vf __builtin_altivec_vmrgow_v4sf (vf, vf); + VMRGOW_V4SF p8_vmrgow_v4sf {} + + const vsi __builtin_altivec_vmrgow_v4si (vsi, vsi); + VMRGOW_V4SI p8_vmrgow_v4si {} + + const vsc __builtin_altivec_vpermxor (vsc, vsc, vsc); + VPERMXOR altivec_vpermxor {} + + const vsi __builtin_altivec_vpksdss (vsll, vsll); + VPKSDSS altivec_vpksdss {} + + const vsi __builtin_altivec_vpksdus (vsll, vsll); + VPKSDUS altivec_vpksdus {} + + const vsi __builtin_altivec_vpkudum (vsll, vsll); + VPKUDUM altivec_vpkudum {} + + const vsi __builtin_altivec_vpkudus (vsll, vsll); + VPKUDUS altivec_vpkudus {} + + const vsc __builtin_altivec_vpmsumb 
(vsc, vsc); + VPMSUMB_A crypto_vpmsumb {} + + const vsll __builtin_altivec_vpmsumd (vsll, vsll); + VPMSUMD_A crypto_vpmsumd {} + + const vss __builtin_altivec_vpmsumh (vss, vss); + VPMSUMH_A crypto_vpmsumh {} + + const vsi __builtin_altivec_vpmsumw (vsi, vsi); + VPMSUMW_A crypto_vpmsumw {} + + const vsc __builtin_altivec_vpopcntb (vsc); + VPOPCNTB popcountv16qi2 {} + + const vsll __builtin_altivec_vpopcntd (vsll); + VPOPCNTD popcountv2di2 {} + + const vss __builtin_altivec_vpopcnth (vss); + VPOPCNTH popcountv8hi2 {} + + const vsc __builtin_altivec_vpopcntub (vsc); + VPOPCNTUB popcountv16qi2 {} + + const vsll __builtin_altivec_vpopcntud (vsll); + VPOPCNTUD popcountv2di2 {} + + const vss __builtin_altivec_vpopcntuh (vss); + VPOPCNTUH popcountv8hi2 {} + + const vsi __builtin_altivec_vpopcntuw (vsi); + VPOPCNTUW popcountv4si2 {} + + const vsi __builtin_altivec_vpopcntw (vsi); + VPOPCNTW popcountv4si2 {} + + const vsll __builtin_altivec_vrld (vsll, vsll); + VRLD vrotlv2di3 {} + + const vsll __builtin_altivec_vsld (vsll, vsll); + VSLD vashlv2di3 {} + + const vsll __builtin_altivec_vsrad (vsll, vsll); + VSRAD vashrv2di3 {} + + const vsll __builtin_altivec_vsrd (vsll, vull); + VSRD vlshrv2di3 {} + + const vsq __builtin_altivec_vsubcuq (vsq, vsq); + VSUBCUQ altivec_vsubcuq {} + + const vsq __builtin_altivec_vsubecuq (vsq, vsq, vsq); + VSUBECUQ altivec_vsubecuq {} + + const vsq __builtin_altivec_vsubeuqm (vsq, vsq, vsq); + VSUBEUQM altivec_vsubeuqm {} + + const vsll __builtin_altivec_vsubudm (vsll, vsll); + VSUBUDM subv2di3 {} + + const vsq __builtin_altivec_vsubuqm (vsq, vsq); + VSUBUQM altivec_vsubuqm {} + + const vsll __builtin_altivec_vupkhsw (vsi); + VUPKHSW altivec_vupkhsw {} + + const vsll __builtin_altivec_vupklsw (vsi); + VUPKLSW altivec_vupklsw {} + + const vsq __builtin_bcdadd_v1ti (vsq, vsq, const int<1>); + BCDADD_V1TI bcdadd_v1ti {} + + const vsc __builtin_bcdadd_v16qi (vsc, vsc, const int<1>); + BCDADD_V16QI bcdadd_v16qi {} + + const signed int 
__builtin_bcdadd_eq_v1ti (vsq, vsq, const int<1>); + BCDADD_EQ_V1TI bcdadd_eq_v1ti {} + + const signed int __builtin_bcdadd_eq_v16qi (vsc, vsc, const int<1>); + BCDADD_EQ_V16QI bcdadd_eq_v16qi {} + + const signed int __builtin_bcdadd_gt_v1ti (vsq, vsq, const int<1>); + BCDADD_GT_V1TI bcdadd_gt_v1ti {} + + const signed int __builtin_bcdadd_gt_v16qi (vsc, vsc, const int<1>); + BCDADD_GT_V16QI bcdadd_gt_v16qi {} + + const signed int __builtin_bcdadd_lt_v1ti (vsq, vsq, const int<1>); + BCDADD_LT_V1TI bcdadd_lt_v1ti {} + + const signed int __builtin_bcdadd_lt_v16qi (vsc, vsc, const int<1>); + BCDADD_LT_V16QI bcdadd_lt_v16qi {} + + const signed int __builtin_bcdadd_ov_v1ti (vsq, vsq, const int<1>); + BCDADD_OV_V1TI bcdadd_unordered_v1ti {} + + const signed int __builtin_bcdadd_ov_v16qi (vsc, vsc, const int<1>); + BCDADD_OV_V16QI bcdadd_unordered_v16qi {} + + const signed int __builtin_bcdinvalid_v1ti (vsq); + BCDINVALID_V1TI bcdinvalid_v1ti {} + + const signed int __builtin_bcdinvalid_v16qi (vsc); + BCDINVALID_V16QI bcdinvalid_v16qi {} + + const vsq __builtin_bcdsub_v1ti (vsq, vsq, const int<1>); + BCDSUB_V1TI bcdsub_v1ti {} + + const vsc __builtin_bcdsub_v16qi (vsc, vsc, const int<1>); + BCDSUB_V16QI bcdsub_v16qi {} + + const signed int __builtin_bcdsub_eq_v1ti (vsq, vsq, const int<1>); + BCDSUB_EQ_V1TI bcdsub_eq_v1ti {} + + const signed int __builtin_bcdsub_eq_v16qi (vsc, vsc, const int<1>); + BCDSUB_EQ_V16QI bcdsub_eq_v16qi {} + + const signed int __builtin_bcdsub_ge_v1ti (vsq, vsq, const int<1>); + BCDSUB_GE_V1TI bcdsub_ge_v1ti {} + + const signed int __builtin_bcdsub_ge_v16qi (vsc, vsc, const int<1>); + BCDSUB_GE_V16QI bcdsub_ge_v16qi {} + + const signed int __builtin_bcdsub_gt_v1ti (vsq, vsq, const int<1>); + BCDSUB_GT_V1TI bcdsub_gt_v1ti {} + + const signed int __builtin_bcdsub_gt_v16qi (vsc, vsc, const int<1>); + BCDSUB_GT_V16QI bcdsub_gt_v16qi {} + + const signed int __builtin_bcdsub_le_v1ti (vsq, vsq, const int<1>); + BCDSUB_LE_V1TI bcdsub_le_v1ti {} + + const 
signed int __builtin_bcdsub_le_v16qi (vsc, vsc, const int<1>); + BCDSUB_LE_V16QI bcdsub_le_v16qi {} + + const signed int __builtin_bcdsub_lt_v1ti (vsq, vsq, const int<1>); + BCDSUB_LT_V1TI bcdsub_lt_v1ti {} + + const signed int __builtin_bcdsub_lt_v16qi (vsc, vsc, const int<1>); + BCDSUB_LT_V16QI bcdsub_lt_v16qi {} + + const signed int __builtin_bcdsub_ov_v1ti (vsq, vsq, const int<1>); + BCDSUB_OV_V1TI bcdsub_unordered_v1ti {} + + const signed int __builtin_bcdsub_ov_v16qi (vsc, vsc, const int<1>); + BCDSUB_OV_V16QI bcdsub_unordered_v16qi {} + + const vuc __builtin_crypto_vpermxor_v16qi (vuc, vuc, vuc); + VPERMXOR_V16QI crypto_vpermxor_v16qi {} + + const vull __builtin_crypto_vpermxor_v2di (vull, vull, vull); + VPERMXOR_V2DI crypto_vpermxor_v2di {} + + const vui __builtin_crypto_vpermxor_v4si (vui, vui, vui); + VPERMXOR_V4SI crypto_vpermxor_v4si {} + + const vus __builtin_crypto_vpermxor_v8hi (vus, vus, vus); + VPERMXOR_V8HI crypto_vpermxor_v8hi {} + + const vuc __builtin_crypto_vpmsumb (vuc, vuc); + VPMSUMB crypto_vpmsumb {} + + const vull __builtin_crypto_vpmsumd (vull, vull); + VPMSUMD crypto_vpmsumd {} + + const vus __builtin_crypto_vpmsumh (vus, vus); + VPMSUMH crypto_vpmsumh {} + + const vui __builtin_crypto_vpmsumw (vui, vui); + VPMSUMW crypto_vpmsumw {} + + const vf __builtin_vsx_float2_v2df (vd, vd); + FLOAT2_V2DF float2_v2df {} + + const vf __builtin_vsx_float2_v2di (vsll, vsll); + FLOAT2_V2DI float2_v2di {} + + const vsc __builtin_vsx_revb_v16qi (vsc); + REVB_V16QI revb_v16qi {} + + const vsq __builtin_vsx_revb_v1ti (vsq); + REVB_V1TI revb_v1ti {} + + const vd __builtin_vsx_revb_v2df (vd); + REVB_V2DF revb_v2df {} + + const vsll __builtin_vsx_revb_v2di (vsll); + REVB_V2DI revb_v2di {} + + const vf __builtin_vsx_revb_v4sf (vf); + REVB_V4SF revb_v4sf {} + + const vsi __builtin_vsx_revb_v4si (vsi); + REVB_V4SI revb_v4si {} + + const vss __builtin_vsx_revb_v8hi (vss); + REVB_V8HI revb_v8hi {} + + const vf __builtin_vsx_uns_float2_v2di (vsll, vsll); + 
UNS_FLOAT2_V2DI uns_float2_v2di {} + + const vsi __builtin_vsx_vsigned2_v2df (vd, vd); + VEC_VSIGNED2_V2DF vsigned2_v2df {} + + const vsi __builtin_vsx_vunsigned2_v2df (vd, vd); + VEC_VUNSIGNED2_V2DF vunsigned2_v2df {} + + const vf __builtin_vsx_xscvdpspn (double); + XSCVDPSPN vsx_xscvdpspn {} + + const double __builtin_vsx_xscvspdpn (vf); + XSCVSPDPN vsx_xscvspdpn {} -- cgit v1.1