author     Martin Liska <mliska@suse.cz>  2021-08-24 16:42:47 +0200
committer  Martin Liska <mliska@suse.cz>  2021-08-24 16:42:47 +0200
commit     7572f9cd10edd3bc1889a8f513dbf77b7f4e470d (patch)
tree       2f77059468d7b182b29483f6fa674fe5d6550652 /gcc
parent     eb2de151c582a38efc53ce57416f7bd7a3a9c0eb (diff)
parent     8ce18a29ef717f5920ebf5dc1d9e84570a1827d4 (diff)
Merge branch 'master' into devel/sphinx
Diffstat (limited to 'gcc')
 gcc/config/arm/arm-cpus.in                                          |   9
 gcc/config/arm/arm.c                                                |   9
 gcc/config/arm/arm.md                                               |  11
 gcc/config/arm/arm.opt                                              |   4
 gcc/config/arm/vfp.md                                               |  29
 gcc/config/i386/i386-expand.c                                       |  13
 gcc/config/i386/i386.c                                              |   5
 gcc/config/i386/i386.h                                              |   2
 gcc/config/i386/predicates.md                                       |   7
 gcc/config/i386/sse.md                                              | 238
 gcc/config/rs6000/rs6000-builtin-new.def                            | 477
 gcc/doc/invoke.texi                                                 |  16
 gcc/params.opt                                                      |   4
 gcc/testsuite/gcc.dg/predict-1.c                                    |   2
 gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c                          | 144
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c     |  31
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c      |  28
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c      |  30
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c |  27
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c |  29
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c   |  30
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c    |  27
 gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c    |  29
 gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c       |   4
 gcc/testsuite/gcc.target/i386/pr100865-10b.c                        |   1
 gcc/testsuite/gcc.target/i386/pr100865-4b.c                         |   3
 gcc/testsuite/gcc.target/i386/pr100865-6b.c                         |   6
 gcc/testsuite/gcc.target/i386/pr100865-7b.c                         |   6
 gcc/testsuite/gcc.target/i386/pr101989-1.c                          |  51
 gcc/testsuite/gcc.target/i386/pr101989-2.c                          | 102
 gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c                |  31
 gcc/testsuite/gcc.target/i386/pr102021.c                            |  15
 gcc/testsuite/lib/target-supports.exp                               |  15
 gcc/tree-vect-loop.c                                                |   7
 gcc/tree-vect-slp.c                                                 |  70
 gcc/tree-vectorizer.c                                               |  20
 gcc/tree-vectorizer.h                                               |   2
 gcc/value-relation.cc                                               | 287
 gcc/value-relation.h                                                |   9
 39 files changed, 1720 insertions(+), 110 deletions(-)
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 249995a..bcc9ebe 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -186,6 +186,9 @@ define feature quirk_armv6kz
# Cortex-M3 LDRD quirk.
define feature quirk_cm3_ldrd
+# v8-m/v8.1-m VLLDM errata.
+define feature quirk_vlldm
+
# Don't use .cpu assembly directive
define feature quirk_no_asmcpu
@@ -322,7 +325,7 @@ define implied vfp_base MVE MVE_FP ALL_FP
# architectures.
# xscale isn't really a 'quirk', but it isn't an architecture either and we
# need to ignore it for matching purposes.
-define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd xscale quirk_no_asmcpu
+define fgroup ALL_QUIRKS quirk_no_volatile_ce quirk_armv6kz quirk_cm3_ldrd quirk_vlldm xscale quirk_no_asmcpu
define fgroup IGNORE_FOR_MULTILIB cdecp0 cdecp1 cdecp2 cdecp3 cdecp4 cdecp5 cdecp6 cdecp7
@@ -1571,6 +1574,7 @@ begin cpu cortex-m33
architecture armv8-m.main+dsp+fp
option nofp remove ALL_FP
option nodsp remove armv7em
+ isa quirk_vlldm
costs v7m
end cpu cortex-m33
@@ -1580,6 +1584,7 @@ begin cpu cortex-m35p
architecture armv8-m.main+dsp+fp
option nofp remove ALL_FP
option nodsp remove armv7em
+ isa quirk_vlldm
costs v7m
end cpu cortex-m35p
@@ -1591,7 +1596,7 @@ begin cpu cortex-m55
option nomve remove mve mve_float
option nofp remove ALL_FP mve_float
option nodsp remove MVE mve_float
- isa quirk_no_asmcpu
+ isa quirk_no_asmcpu quirk_vlldm
costs v7m
vendor 41
end cpu cortex-m55
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 11dafc7..5c92941 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -3616,6 +3616,15 @@ arm_option_override (void)
fix_cm3_ldrd = 0;
}
+ /* Enable fix_vlldm by default if required. */
+ if (fix_vlldm == 2)
+ {
+ if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_vlldm))
+ fix_vlldm = 1;
+ else
+ fix_vlldm = 0;
+ }
+
/* Hot/Cold partitioning is not currently supported, since we can't
handle literal pool placement in that case. */
if (flag_reorder_blocks_and_partition)
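
The arm.c hunk above uses a common GCC idiom for target-dependent option defaults: the option variable is initialised to 2 (see Init(2) in the arm.opt hunk below), meaning "not set on the command line", and the override hook then resolves it from the active CPU's quirk bit. A minimal standalone sketch of the idiom (illustrative C, not GCC source; the helper name is hypothetical):

    #include <stdbool.h>

    static int fix_vlldm = 2;  /* 2 = unset, 1 = enabled, 0 = disabled */

    /* Hypothetical stand-in for the resolution step in arm_option_override:
       an explicit -mfix-cmse-cve-2021-35465 or -mno-... wins; otherwise the
       default follows the CPU's quirk_vlldm bit.  */
    static void
    resolve_fix_vlldm (bool cpu_has_quirk_vlldm)
    {
      if (fix_vlldm == 2)
        fix_vlldm = cpu_has_quirk_vlldm ? 1 : 0;
    }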
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 0646048..5d3f21b 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -132,9 +132,12 @@
; TARGET_32BIT, "t1" or "t2" to specify a specific Thumb mode. "v6"
; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without
; arm_arch6. "v6t2" for Thumb-2 with arm_arch6 and "v8mb" for ARMv8-M
-; Baseline. This attribute is used to compute attribute "enabled",
+; Baseline. "fix_vlldm" is for fixing the v8-m/v8.1-m VLLDM erratum.
+; This attribute is used to compute attribute "enabled",
; use type "any" to enable an alternative in all cases.
-(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,v6t2,v8mb,iwmmxt,iwmmxt2,armv6_or_vfpv3,neon,mve"
+(define_attr "arch" "any, a, t, 32, t1, t2, v6, nov6, v6t2, \
+ v8mb, fix_vlldm, iwmmxt, iwmmxt2, armv6_or_vfpv3, \
+ neon, mve"
(const_string "any"))
(define_attr "arch_enabled" "no,yes"
@@ -177,6 +180,10 @@
(match_test "TARGET_THUMB1 && arm_arch8"))
(const_string "yes")
+ (and (eq_attr "arch" "fix_vlldm")
+ (match_test "fix_vlldm"))
+ (const_string "yes")
+
(and (eq_attr "arch" "iwmmxt2")
(match_test "TARGET_REALLY_IWMMXT2"))
(const_string "yes")
diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
index 7417b55..a7677ee 100644
--- a/gcc/config/arm/arm.opt
+++ b/gcc/config/arm/arm.opt
@@ -268,6 +268,10 @@ Target Var(fix_cm3_ldrd) Init(2)
Avoid overlapping destination and address registers on LDRD instructions
that may trigger Cortex-M3 errata.
+mfix-cmse-cve-2021-35465
+Target Var(fix_vlldm) Init(2)
+Mitigate issues with VLLDM on some M-profile devices (CVE-2021-35465).
+
munaligned-access
Target Var(unaligned_access) Init(2) Save
Enable unaligned word and halfword accesses to packed data.
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 93e96369..f0030a8 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -1703,12 +1703,15 @@
(set_attr "type" "mov_reg")]
)
+;; Both this and the next instruction are treated by GCC in the same
+;; way as a blockage pattern. That's perhaps stronger than it needs
+;; to be, but we do not want accesses to the VFP register bank to be
+;; moved across either instruction.
+
(define_insn "lazy_store_multiple_insn"
- [(set (match_operand:SI 0 "s_register_operand" "+&rk")
- (post_dec:SI (match_dup 0)))
- (unspec_volatile [(const_int 0)
- (mem:SI (post_dec:SI (match_dup 0)))]
- VUNSPEC_VLSTM)]
+ [(unspec_volatile
+ [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk"))]
+ VUNSPEC_VLSTM)]
"use_cmse && reload_completed"
"vlstm%?\\t%0"
[(set_attr "predicable" "yes")
@@ -1716,14 +1719,16 @@
)
(define_insn "lazy_load_multiple_insn"
- [(set (match_operand:SI 0 "s_register_operand" "+&rk")
- (post_inc:SI (match_dup 0)))
- (unspec_volatile:SI [(const_int 0)
- (mem:SI (match_dup 0))]
- VUNSPEC_VLLDM)]
+ [(unspec_volatile
+ [(mem:BLK (match_operand:SI 0 "s_register_operand" "rk,rk"))]
+ VUNSPEC_VLLDM)]
"use_cmse && reload_completed"
- "vlldm%?\\t%0"
- [(set_attr "predicable" "yes")
+ "@
+ vscclrm\\t{vpr}\;vlldm\\t%0
+ vlldm\\t%0"
+ [(set_attr "arch" "fix_vlldm,*")
+ (set_attr "predicable" "no")
+ (set_attr "length" "8,4")
(set_attr "type" "load_4")]
)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 9bf13db..2500dbf 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -579,19 +579,10 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
/* Broadcast to XMM/YMM/ZMM register from an integer
constant or scalar mem. */
- /* Hard registers are used for 2 purposes:
- 1. Prevent stack realignment when the original code
- doesn't use vector registers, which is the same for
- memcpy and memset.
- 2. Prevent combine to convert constant broadcast to
- load from constant pool. */
- op1 = ix86_gen_scratch_sse_rtx (mode);
+ op1 = gen_reg_rtx (mode);
if (FLOAT_MODE_P (mode)
|| (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
- {
- first = force_const_mem (GET_MODE_INNER (mode), first);
- op1 = gen_reg_rtx (mode);
- }
+ first = force_const_mem (GET_MODE_INNER (mode), first);
bool ok = ix86_expand_vector_init_duplicate (false, mode,
op1, first);
gcc_assert (ok);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5bff131..ebec866 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20542,6 +20542,11 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case UNSPEC:
if (XINT (x, 1) == UNSPEC_TP)
*total = 0;
+ else if (XINT (x, 1) == UNSPEC_VTERNLOG)
+ {
+ *total = cost->sse_op;
+ return true;
+ }
return false;
case VEC_SELECT:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 11ac8d0..6511422 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1716,6 +1716,8 @@ typedef struct ix86_args {
#define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X)
+#define STRIP_UNARY(X) (UNARY_P (X) ? XEXP (X, 0) : X)
+
#define SYMBOLIC_CONST(X) \
(GET_CODE (X) == SYMBOL_REF \
|| GET_CODE (X) == LABEL_REF \
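
The new STRIP_UNARY macro peels one unary wrapper, such as (not (reg ...)), off an RTX so the vpternlog splitters in sse.md below can match operands modulo negation. A standalone analogue (illustrative only; the struct and function names are hypothetical):

    struct expr
    {
      int is_unary;        /* nonzero for a unary node such as NOT */
      struct expr *op0;    /* its single operand */
    };

    /* Mirrors (UNARY_P (X) ? XEXP (X, 0) : X): return the operand of a
       unary node, or the node itself otherwise.  */
    static struct expr *
    strip_unary (struct expr *e)
    {
      return e->is_unary ? e->op0 : e;
    }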
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 9321f33..df5acb4 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1044,6 +1044,13 @@
(ior (match_test "op == const1_rtx")
(match_test "op == constm1_rtx")))))
+;; True for registers, or (not: registers). Used to optimize 3-operand
+;; bitwise operations.
+(define_predicate "reg_or_notreg_operand"
+ (ior (match_operand 0 "register_operand")
+ (and (match_code "not")
+ (match_test "register_operand (XEXP (op, 0), mode)"))))
+
;; True if OP is acceptable as operand of DImode shift expander.
(define_predicate "shiftdi_operand"
(if_then_else (match_test "TARGET_64BIT")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 95f9582..03fc2df 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -933,7 +933,9 @@
;; Mapping of vector modes to VPTERNLOG suffix
(define_mode_attr ternlogsuffix
[(V8DI "q") (V4DI "q") (V2DI "q")
+ (V8DF "q") (V4DF "q") (V2DF "q")
(V16SI "d") (V8SI "d") (V4SI "d")
+ (V16SF "d") (V8SF "d") (V4SF "d")
(V32HI "d") (V16HI "d") (V8HI "d")
(V64QI "d") (V32QI "d") (V16QI "d")])
@@ -10032,7 +10034,7 @@
(unspec:VI48_AVX512VL
[(match_operand:VI48_AVX512VL 1 "register_operand" "0")
(match_operand:VI48_AVX512VL 2 "register_operand" "v")
- (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm")
+ (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr")
(match_operand:SI 4 "const_0_to_255_operand")]
UNSPEC_VTERNLOG))]
"TARGET_AVX512F"
@@ -10041,13 +10043,245 @@
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_vternlog<mode>_all"
+ [(set (match_operand:V 0 "register_operand" "=v")
+ (unspec:V
+ [(match_operand:V 1 "register_operand" "0")
+ (match_operand:V 2 "register_operand" "v")
+ (match_operand:V 3 "bcst_vector_operand" "vmBr")
+ (match_operand:SI 4 "const_0_to_255_operand")]
+ UNSPEC_VTERNLOG))]
+ "TARGET_AVX512F"
+ "vpternlog<ternlogsuffix>\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
+;; There must be lots of other combinations like
+;;
+;; (any_logic:V
+;; (any_logic:V op1 op2)
+;; (any_logic:V op1 op3))
+;;
+;; (any_logic:V
+;; (any_logic:V
+;; (any_logic:V op1, op2)
+;; op3)
+;; op1)
+;;
+;; and so on.
+
+(define_code_iterator any_logic1 [and ior xor])
+(define_code_iterator any_logic2 [and ior xor])
+(define_code_attr logic_op [(and "&") (ior "|") (xor "^")])
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_1"
+ [(set (match_operand:V 0 "register_operand")
+ (any_logic:V
+ (any_logic1:V
+ (match_operand:V 1 "reg_or_notreg_operand")
+ (match_operand:V 2 "reg_or_notreg_operand"))
+ (any_logic2:V
+ (match_operand:V 3 "reg_or_notreg_operand")
+ (match_operand:V 4 "reg_or_notreg_operand"))))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ && ix86_pre_reload_split ()
+ && (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[4]))
+ || rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[4]))
+ || rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[3]))
+ || rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[3])))"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:V
+ [(match_dup 6)
+ (match_dup 2)
+ (match_dup 1)
+ (match_dup 5)]
+ UNSPEC_VTERNLOG))]
+{
+ /* VPTERNLOGD reg6, reg2, reg1, imm8. */
+ int reg6 = 0xF0;
+ int reg2 = 0xCC;
+ int reg1 = 0xAA;
+ int reg3 = 0;
+ int reg4 = 0;
+ int reg_mask, tmp1, tmp2;
+ if (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[4])))
+ {
+ reg4 = reg1;
+ reg3 = reg6;
+ operands[6] = operands[3];
+ }
+ else if (rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[4])))
+ {
+ reg4 = reg2;
+ reg3 = reg6;
+ operands[6] = operands[3];
+ }
+ else if (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[3])))
+ {
+ reg4 = reg6;
+ reg3 = reg1;
+ operands[6] = operands[4];
+ }
+ else
+ {
+ reg4 = reg6;
+ reg3 = reg2;
+ operands[6] = operands[4];
+ }
+
+ reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+ reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+ reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+ reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+ tmp1 = reg1 <any_logic1:logic_op> reg2;
+ tmp2 = reg3 <any_logic2:logic_op> reg4;
+ reg_mask = tmp1 <any_logic:logic_op> tmp2;
+ reg_mask &= 0xFF;
+
+ operands[1] = STRIP_UNARY (operands[1]);
+ operands[2] = STRIP_UNARY (operands[2]);
+ operands[6] = STRIP_UNARY (operands[6]);
+ operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_2"
+ [(set (match_operand:V 0 "register_operand")
+ (any_logic:V
+ (any_logic1:V
+ (any_logic2:V
+ (match_operand:V 1 "reg_or_notreg_operand")
+ (match_operand:V 2 "reg_or_notreg_operand"))
+ (match_operand:V 3 "reg_or_notreg_operand"))
+ (match_operand:V 4 "reg_or_notreg_operand")))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ && ix86_pre_reload_split ()
+ && (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[4]))
+ || rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[4]))
+ || rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[3]))
+ || rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[3])))"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:V
+ [(match_dup 6)
+ (match_dup 2)
+ (match_dup 1)
+ (match_dup 5)]
+ UNSPEC_VTERNLOG))]
+{
+ /* VPTERNLOGD reg6, reg2, reg1, imm8. */
+ int reg6 = 0xF0;
+ int reg2 = 0xCC;
+ int reg1 = 0xAA;
+ int reg3 = 0;
+ int reg4 = 0;
+ int reg_mask, tmp1, tmp2;
+ if (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[4])))
+ {
+ reg4 = reg1;
+ reg3 = reg6;
+ operands[6] = operands[3];
+ }
+ else if (rtx_equal_p (STRIP_UNARY (operands[2]),
+ STRIP_UNARY (operands[4])))
+ {
+ reg4 = reg2;
+ reg3 = reg6;
+ operands[6] = operands[3];
+ }
+ else if (rtx_equal_p (STRIP_UNARY (operands[1]),
+ STRIP_UNARY (operands[3])))
+ {
+ reg4 = reg6;
+ reg3 = reg1;
+ operands[6] = operands[4];
+ }
+ else
+ {
+ reg4 = reg6;
+ reg3 = reg2;
+ operands[6] = operands[4];
+ }
+
+ reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+ reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+ reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+ reg4 = UNARY_P (operands[4]) ? ~reg4 : reg4;
+
+ tmp1 = reg1 <any_logic2:logic_op> reg2;
+ tmp2 = tmp1 <any_logic1:logic_op> reg3;
+ reg_mask = tmp2 <any_logic:logic_op> reg4;
+ reg_mask &= 0xFF;
+
+ operands[1] = STRIP_UNARY (operands[1]);
+ operands[2] = STRIP_UNARY (operands[2]);
+ operands[6] = STRIP_UNARY (operands[6]);
+ operands[5] = GEN_INT (reg_mask);
+})
+
+(define_insn_and_split "*<avx512>_vpternlog<mode>_3"
+ [(set (match_operand:V 0 "register_operand")
+ (any_logic:V
+ (any_logic1:V
+ (match_operand:V 1 "reg_or_notreg_operand")
+ (match_operand:V 2 "reg_or_notreg_operand"))
+ (match_operand:V 3 "reg_or_notreg_operand")))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL)
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (unspec:V
+ [(match_dup 3)
+ (match_dup 2)
+ (match_dup 1)
+ (match_dup 4)]
+ UNSPEC_VTERNLOG))]
+{
+ /* VPTERNLOGD reg3, reg2, reg1, imm8. */
+ int reg3 = 0xF0;
+ int reg2 = 0xCC;
+ int reg1 = 0xAA;
+ int reg_mask, tmp1;
+
+ reg1 = UNARY_P (operands[1]) ? ~reg1 : reg1;
+ reg2 = UNARY_P (operands[2]) ? ~reg2 : reg2;
+ reg3 = UNARY_P (operands[3]) ? ~reg3 : reg3;
+
+ tmp1 = reg1 <any_logic1:logic_op> reg2;
+ reg_mask = tmp1 <any_logic:logic_op> reg3;
+ reg_mask &= 0xFF;
+
+ operands[1] = STRIP_UNARY (operands[1]);
+ operands[2] = STRIP_UNARY (operands[2]);
+ operands[3] = STRIP_UNARY (operands[3]);
+ operands[4] = GEN_INT (reg_mask);
+})
+
+
(define_insn "<avx512>_vternlog<mode>_mask"
[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
(vec_merge:VI48_AVX512VL
(unspec:VI48_AVX512VL
[(match_operand:VI48_AVX512VL 1 "register_operand" "0")
(match_operand:VI48_AVX512VL 2 "register_operand" "v")
- (match_operand:VI48_AVX512VL 3 "nonimmediate_operand" "vm")
+ (match_operand:VI48_AVX512VL 3 "bcst_vector_operand" "vmBr")
(match_operand:SI 4 "const_0_to_255_operand")]
UNSPEC_VTERNLOG)
(match_dup 1)
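
The immediates in the splitters above use the standard three-input truth-table encoding: operands[1] contributes 0xAA, operands[2] contributes 0xCC, and the remaining input contributes 0xF0, with each constant bitwise-inverted when its operand is wrapped in a NOT; evaluating the source expression on these constants yields the VPTERNLOG imm8. A runnable C check of that computation (a standalone illustration, not GCC source):

    #include <stdio.h>

    int
    main (void)
    {
      int op1 = 0xAA, op2 = 0xCC, op3 = 0xF0;  /* three-input truth tables */
      /* imm8 for (op2 & ~op3) | (op1 & op3), i.e. a bitwise select.  */
      int imm = ((op2 & ~op3) | (op1 & op3)) & 0xFF;
      printf ("imm8 = 0x%02x\n", imm);         /* prints imm8 = 0xac */
      return 0;
    }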
diff --git a/gcc/config/rs6000/rs6000-builtin-new.def b/gcc/config/rs6000/rs6000-builtin-new.def
index 61f5b94..2a2c913 100644
--- a/gcc/config/rs6000/rs6000-builtin-new.def
+++ b/gcc/config/rs6000/rs6000-builtin-new.def
@@ -1961,3 +1961,480 @@
const vsll __builtin_vsx_xxspltd_2di (vsll, const int<1>);
XXSPLTD_V2DI vsx_xxspltd_v2di {}
+
+
+; Power7 builtins (ISA 2.06).
+[power7]
+ const unsigned int __builtin_addg6s (unsigned int, unsigned int);
+ ADDG6S addg6s {}
+
+ const signed long __builtin_bpermd (signed long, signed long);
+ BPERMD bpermd_di {}
+
+ const unsigned int __builtin_cbcdtd (unsigned int);
+ CBCDTD cbcdtd {}
+
+ const unsigned int __builtin_cdtbcd (unsigned int);
+ CDTBCD cdtbcd {}
+
+ const signed int __builtin_divwe (signed int, signed int);
+ DIVWE dive_si {}
+
+ const unsigned int __builtin_divweu (unsigned int, unsigned int);
+ DIVWEU diveu_si {}
+
+ const vsq __builtin_pack_vector_int128 (unsigned long long, unsigned long long);
+ PACK_V1TI packv1ti {}
+
+ void __builtin_ppc_speculation_barrier ();
+ SPECBARR speculation_barrier {}
+
+ const unsigned long __builtin_unpack_vector_int128 (vsq, const int<1>);
+ UNPACK_V1TI unpackv1ti {}
+
+
+; Power7 builtins requiring 64-bit GPRs (even with 32-bit addressing).
+[power7-64]
+ const signed long long __builtin_divde (signed long long, signed long long);
+ DIVDE dive_di {}
+
+ const unsigned long long __builtin_divdeu (unsigned long long, unsigned long long);
+ DIVDEU diveu_di {}
+
+
+; Power8 vector built-ins.
+[power8-vector]
+ const vsll __builtin_altivec_abs_v2di (vsll);
+ ABS_V2DI absv2di2 {}
+
+ const vsc __builtin_altivec_bcddiv10_v16qi (vsc);
+ BCDDIV10_V16QI bcddiv10_v16qi {}
+
+ const vsc __builtin_altivec_bcdmul10_v16qi (vsc);
+ BCDMUL10_V16QI bcdmul10_v16qi {}
+
+ const vsc __builtin_altivec_eqv_v16qi (vsc, vsc);
+ EQV_V16QI eqvv16qi3 {}
+
+ const vuc __builtin_altivec_eqv_v16qi_uns (vuc, vuc);
+ EQV_V16QI_UNS eqvv16qi3 {}
+
+ const vsq __builtin_altivec_eqv_v1ti (vsq, vsq);
+ EQV_V1TI eqvv1ti3 {}
+
+ const vuq __builtin_altivec_eqv_v1ti_uns (vuq, vuq);
+ EQV_V1TI_UNS eqvv1ti3 {}
+
+ const vd __builtin_altivec_eqv_v2df (vd, vd);
+ EQV_V2DF eqvv2df3 {}
+
+ const vsll __builtin_altivec_eqv_v2di (vsll, vsll);
+ EQV_V2DI eqvv2di3 {}
+
+ const vull __builtin_altivec_eqv_v2di_uns (vull, vull);
+ EQV_V2DI_UNS eqvv2di3 {}
+
+ const vf __builtin_altivec_eqv_v4sf (vf, vf);
+ EQV_V4SF eqvv4sf3 {}
+
+ const vsi __builtin_altivec_eqv_v4si (vsi, vsi);
+ EQV_V4SI eqvv4si3 {}
+
+ const vui __builtin_altivec_eqv_v4si_uns (vui, vui);
+ EQV_V4SI_UNS eqvv4si3 {}
+
+ const vss __builtin_altivec_eqv_v8hi (vss, vss);
+ EQV_V8HI eqvv8hi3 {}
+
+ const vus __builtin_altivec_eqv_v8hi_uns (vus, vus);
+ EQV_V8HI_UNS eqvv8hi3 {}
+
+ const vsc __builtin_altivec_nand_v16qi (vsc, vsc);
+ NAND_V16QI nandv16qi3 {}
+
+ const vuc __builtin_altivec_nand_v16qi_uns (vuc, vuc);
+ NAND_V16QI_UNS nandv16qi3 {}
+
+ const vsq __builtin_altivec_nand_v1ti (vsq, vsq);
+ NAND_V1TI nandv1ti3 {}
+
+ const vuq __builtin_altivec_nand_v1ti_uns (vuq, vuq);
+ NAND_V1TI_UNS nandv1ti3 {}
+
+ const vd __builtin_altivec_nand_v2df (vd, vd);
+ NAND_V2DF nandv2df3 {}
+
+ const vsll __builtin_altivec_nand_v2di (vsll, vsll);
+ NAND_V2DI nandv2di3 {}
+
+ const vull __builtin_altivec_nand_v2di_uns (vull, vull);
+ NAND_V2DI_UNS nandv2di3 {}
+
+ const vf __builtin_altivec_nand_v4sf (vf, vf);
+ NAND_V4SF nandv4sf3 {}
+
+ const vsi __builtin_altivec_nand_v4si (vsi, vsi);
+ NAND_V4SI nandv4si3 {}
+
+ const vui __builtin_altivec_nand_v4si_uns (vui, vui);
+ NAND_V4SI_UNS nandv4si3 {}
+
+ const vss __builtin_altivec_nand_v8hi (vss, vss);
+ NAND_V8HI nandv8hi3 {}
+
+ const vus __builtin_altivec_nand_v8hi_uns (vus, vus);
+ NAND_V8HI_UNS nandv8hi3 {}
+
+ const vsc __builtin_altivec_neg_v16qi (vsc);
+ NEG_V16QI negv16qi2 {}
+
+ const vd __builtin_altivec_neg_v2df (vd);
+ NEG_V2DF negv2df2 {}
+
+ const vsll __builtin_altivec_neg_v2di (vsll);
+ NEG_V2DI negv2di2 {}
+
+ const vf __builtin_altivec_neg_v4sf (vf);
+ NEG_V4SF negv4sf2 {}
+
+ const vsi __builtin_altivec_neg_v4si (vsi);
+ NEG_V4SI negv4si2 {}
+
+ const vss __builtin_altivec_neg_v8hi (vss);
+ NEG_V8HI negv8hi2 {}
+
+ const vsc __builtin_altivec_orc_v16qi (vsc, vsc);
+ ORC_V16QI orcv16qi3 {}
+
+ const vuc __builtin_altivec_orc_v16qi_uns (vuc, vuc);
+ ORC_V16QI_UNS orcv16qi3 {}
+
+ const vsq __builtin_altivec_orc_v1ti (vsq, vsq);
+ ORC_V1TI orcv1ti3 {}
+
+ const vuq __builtin_altivec_orc_v1ti_uns (vuq, vuq);
+ ORC_V1TI_UNS orcv1ti3 {}
+
+ const vd __builtin_altivec_orc_v2df (vd, vd);
+ ORC_V2DF orcv2df3 {}
+
+ const vsll __builtin_altivec_orc_v2di (vsll, vsll);
+ ORC_V2DI orcv2di3 {}
+
+ const vull __builtin_altivec_orc_v2di_uns (vull, vull);
+ ORC_V2DI_UNS orcv2di3 {}
+
+ const vf __builtin_altivec_orc_v4sf (vf, vf);
+ ORC_V4SF orcv4sf3 {}
+
+ const vsi __builtin_altivec_orc_v4si (vsi, vsi);
+ ORC_V4SI orcv4si3 {}
+
+ const vui __builtin_altivec_orc_v4si_uns (vui, vui);
+ ORC_V4SI_UNS orcv4si3 {}
+
+ const vss __builtin_altivec_orc_v8hi (vss, vss);
+ ORC_V8HI orcv8hi3 {}
+
+ const vus __builtin_altivec_orc_v8hi_uns (vus, vus);
+ ORC_V8HI_UNS orcv8hi3 {}
+
+ const vsc __builtin_altivec_vclzb (vsc);
+ VCLZB clzv16qi2 {}
+
+ const vsll __builtin_altivec_vclzd (vsll);
+ VCLZD clzv2di2 {}
+
+ const vss __builtin_altivec_vclzh (vss);
+ VCLZH clzv8hi2 {}
+
+ const vsi __builtin_altivec_vclzw (vsi);
+ VCLZW clzv4si2 {}
+
+ const vuc __builtin_altivec_vgbbd (vuc);
+ VGBBD p8v_vgbbd {}
+
+ const vsq __builtin_altivec_vaddcuq (vsq, vsq);
+ VADDCUQ altivec_vaddcuq {}
+
+ const vsq __builtin_altivec_vaddecuq (vsq, vsq, vsq);
+ VADDECUQ altivec_vaddecuq {}
+
+ const vsq __builtin_altivec_vaddeuqm (vsq, vsq, vsq);
+ VADDEUQM altivec_vaddeuqm {}
+
+ const vsll __builtin_altivec_vaddudm (vsll, vsll);
+ VADDUDM addv2di3 {}
+
+ const vsq __builtin_altivec_vadduqm (vsq, vsq);
+ VADDUQM altivec_vadduqm {}
+
+ const vsll __builtin_altivec_vbpermq (vsc, vsc);
+ VBPERMQ altivec_vbpermq {}
+
+ const vsc __builtin_altivec_vbpermq2 (vsc, vsc);
+ VBPERMQ2 altivec_vbpermq2 {}
+
+ const vsll __builtin_altivec_vmaxsd (vsll, vsll);
+ VMAXSD smaxv2di3 {}
+
+ const vull __builtin_altivec_vmaxud (vull, vull);
+ VMAXUD umaxv2di3 {}
+
+ const vsll __builtin_altivec_vminsd (vsll, vsll);
+ VMINSD sminv2di3 {}
+
+ const vull __builtin_altivec_vminud (vull, vull);
+ VMINUD uminv2di3 {}
+
+ const vd __builtin_altivec_vmrgew_v2df (vd, vd);
+ VMRGEW_V2DF p8_vmrgew_v2df {}
+
+ const vsll __builtin_altivec_vmrgew_v2di (vsll, vsll);
+ VMRGEW_V2DI p8_vmrgew_v2di {}
+
+ const vf __builtin_altivec_vmrgew_v4sf (vf, vf);
+ VMRGEW_V4SF p8_vmrgew_v4sf {}
+
+ const vsi __builtin_altivec_vmrgew_v4si (vsi, vsi);
+ VMRGEW_V4SI p8_vmrgew_v4si {}
+
+ const vd __builtin_altivec_vmrgow_v2df (vd, vd);
+ VMRGOW_V2DF p8_vmrgow_v2df {}
+
+ const vsll __builtin_altivec_vmrgow_v2di (vsll, vsll);
+ VMRGOW_V2DI p8_vmrgow_v2di {}
+
+ const vf __builtin_altivec_vmrgow_v4sf (vf, vf);
+ VMRGOW_V4SF p8_vmrgow_v4sf {}
+
+ const vsi __builtin_altivec_vmrgow_v4si (vsi, vsi);
+ VMRGOW_V4SI p8_vmrgow_v4si {}
+
+ const vsc __builtin_altivec_vpermxor (vsc, vsc, vsc);
+ VPERMXOR altivec_vpermxor {}
+
+ const vsi __builtin_altivec_vpksdss (vsll, vsll);
+ VPKSDSS altivec_vpksdss {}
+
+ const vsi __builtin_altivec_vpksdus (vsll, vsll);
+ VPKSDUS altivec_vpksdus {}
+
+ const vsi __builtin_altivec_vpkudum (vsll, vsll);
+ VPKUDUM altivec_vpkudum {}
+
+ const vsi __builtin_altivec_vpkudus (vsll, vsll);
+ VPKUDUS altivec_vpkudus {}
+
+ const vsc __builtin_altivec_vpmsumb (vsc, vsc);
+ VPMSUMB_A crypto_vpmsumb {}
+
+ const vsll __builtin_altivec_vpmsumd (vsll, vsll);
+ VPMSUMD_A crypto_vpmsumd {}
+
+ const vss __builtin_altivec_vpmsumh (vss, vss);
+ VPMSUMH_A crypto_vpmsumh {}
+
+ const vsi __builtin_altivec_vpmsumw (vsi, vsi);
+ VPMSUMW_A crypto_vpmsumw {}
+
+ const vsc __builtin_altivec_vpopcntb (vsc);
+ VPOPCNTB popcountv16qi2 {}
+
+ const vsll __builtin_altivec_vpopcntd (vsll);
+ VPOPCNTD popcountv2di2 {}
+
+ const vss __builtin_altivec_vpopcnth (vss);
+ VPOPCNTH popcountv8hi2 {}
+
+ const vsc __builtin_altivec_vpopcntub (vsc);
+ VPOPCNTUB popcountv16qi2 {}
+
+ const vsll __builtin_altivec_vpopcntud (vsll);
+ VPOPCNTUD popcountv2di2 {}
+
+ const vss __builtin_altivec_vpopcntuh (vss);
+ VPOPCNTUH popcountv8hi2 {}
+
+ const vsi __builtin_altivec_vpopcntuw (vsi);
+ VPOPCNTUW popcountv4si2 {}
+
+ const vsi __builtin_altivec_vpopcntw (vsi);
+ VPOPCNTW popcountv4si2 {}
+
+ const vsll __builtin_altivec_vrld (vsll, vsll);
+ VRLD vrotlv2di3 {}
+
+ const vsll __builtin_altivec_vsld (vsll, vsll);
+ VSLD vashlv2di3 {}
+
+ const vsll __builtin_altivec_vsrad (vsll, vsll);
+ VSRAD vashrv2di3 {}
+
+ const vsll __builtin_altivec_vsrd (vsll, vull);
+ VSRD vlshrv2di3 {}
+
+ const vsq __builtin_altivec_vsubcuq (vsq, vsq);
+ VSUBCUQ altivec_vsubcuq {}
+
+ const vsq __builtin_altivec_vsubecuq (vsq, vsq, vsq);
+ VSUBECUQ altivec_vsubecuq {}
+
+ const vsq __builtin_altivec_vsubeuqm (vsq, vsq, vsq);
+ VSUBEUQM altivec_vsubeuqm {}
+
+ const vsll __builtin_altivec_vsubudm (vsll, vsll);
+ VSUBUDM subv2di3 {}
+
+ const vsq __builtin_altivec_vsubuqm (vsq, vsq);
+ VSUBUQM altivec_vsubuqm {}
+
+ const vsll __builtin_altivec_vupkhsw (vsi);
+ VUPKHSW altivec_vupkhsw {}
+
+ const vsll __builtin_altivec_vupklsw (vsi);
+ VUPKLSW altivec_vupklsw {}
+
+ const vsq __builtin_bcdadd_v1ti (vsq, vsq, const int<1>);
+ BCDADD_V1TI bcdadd_v1ti {}
+
+ const vsc __builtin_bcdadd_v16qi (vsc, vsc, const int<1>);
+ BCDADD_V16QI bcdadd_v16qi {}
+
+ const signed int __builtin_bcdadd_eq_v1ti (vsq, vsq, const int<1>);
+ BCDADD_EQ_V1TI bcdadd_eq_v1ti {}
+
+ const signed int __builtin_bcdadd_eq_v16qi (vsc, vsc, const int<1>);
+ BCDADD_EQ_V16QI bcdadd_eq_v16qi {}
+
+ const signed int __builtin_bcdadd_gt_v1ti (vsq, vsq, const int<1>);
+ BCDADD_GT_V1TI bcdadd_gt_v1ti {}
+
+ const signed int __builtin_bcdadd_gt_v16qi (vsc, vsc, const int<1>);
+ BCDADD_GT_V16QI bcdadd_gt_v16qi {}
+
+ const signed int __builtin_bcdadd_lt_v1ti (vsq, vsq, const int<1>);
+ BCDADD_LT_V1TI bcdadd_lt_v1ti {}
+
+ const signed int __builtin_bcdadd_lt_v16qi (vsc, vsc, const int<1>);
+ BCDADD_LT_V16QI bcdadd_lt_v16qi {}
+
+ const signed int __builtin_bcdadd_ov_v1ti (vsq, vsq, const int<1>);
+ BCDADD_OV_V1TI bcdadd_unordered_v1ti {}
+
+ const signed int __builtin_bcdadd_ov_v16qi (vsc, vsc, const int<1>);
+ BCDADD_OV_V16QI bcdadd_unordered_v16qi {}
+
+ const signed int __builtin_bcdinvalid_v1ti (vsq);
+ BCDINVALID_V1TI bcdinvalid_v1ti {}
+
+ const signed int __builtin_bcdinvalid_v16qi (vsc);
+ BCDINVALID_V16QI bcdinvalid_v16qi {}
+
+ const vsq __builtin_bcdsub_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_V1TI bcdsub_v1ti {}
+
+ const vsc __builtin_bcdsub_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_V16QI bcdsub_v16qi {}
+
+ const signed int __builtin_bcdsub_eq_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_EQ_V1TI bcdsub_eq_v1ti {}
+
+ const signed int __builtin_bcdsub_eq_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_EQ_V16QI bcdsub_eq_v16qi {}
+
+ const signed int __builtin_bcdsub_ge_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_GE_V1TI bcdsub_ge_v1ti {}
+
+ const signed int __builtin_bcdsub_ge_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_GE_V16QI bcdsub_ge_v16qi {}
+
+ const signed int __builtin_bcdsub_gt_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_GT_V1TI bcdsub_gt_v1ti {}
+
+ const signed int __builtin_bcdsub_gt_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_GT_V16QI bcdsub_gt_v16qi {}
+
+ const signed int __builtin_bcdsub_le_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_LE_V1TI bcdsub_le_v1ti {}
+
+ const signed int __builtin_bcdsub_le_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_LE_V16QI bcdsub_le_v16qi {}
+
+ const signed int __builtin_bcdsub_lt_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_LT_V1TI bcdsub_lt_v1ti {}
+
+ const signed int __builtin_bcdsub_lt_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_LT_V16QI bcdsub_lt_v16qi {}
+
+ const signed int __builtin_bcdsub_ov_v1ti (vsq, vsq, const int<1>);
+ BCDSUB_OV_V1TI bcdsub_unordered_v1ti {}
+
+ const signed int __builtin_bcdsub_ov_v16qi (vsc, vsc, const int<1>);
+ BCDSUB_OV_V16QI bcdsub_unordered_v16qi {}
+
+ const vuc __builtin_crypto_vpermxor_v16qi (vuc, vuc, vuc);
+ VPERMXOR_V16QI crypto_vpermxor_v16qi {}
+
+ const vull __builtin_crypto_vpermxor_v2di (vull, vull, vull);
+ VPERMXOR_V2DI crypto_vpermxor_v2di {}
+
+ const vui __builtin_crypto_vpermxor_v4si (vui, vui, vui);
+ VPERMXOR_V4SI crypto_vpermxor_v4si {}
+
+ const vus __builtin_crypto_vpermxor_v8hi (vus, vus, vus);
+ VPERMXOR_V8HI crypto_vpermxor_v8hi {}
+
+ const vuc __builtin_crypto_vpmsumb (vuc, vuc);
+ VPMSUMB crypto_vpmsumb {}
+
+ const vull __builtin_crypto_vpmsumd (vull, vull);
+ VPMSUMD crypto_vpmsumd {}
+
+ const vus __builtin_crypto_vpmsumh (vus, vus);
+ VPMSUMH crypto_vpmsumh {}
+
+ const vui __builtin_crypto_vpmsumw (vui, vui);
+ VPMSUMW crypto_vpmsumw {}
+
+ const vf __builtin_vsx_float2_v2df (vd, vd);
+ FLOAT2_V2DF float2_v2df {}
+
+ const vf __builtin_vsx_float2_v2di (vsll, vsll);
+ FLOAT2_V2DI float2_v2di {}
+
+ const vsc __builtin_vsx_revb_v16qi (vsc);
+ REVB_V16QI revb_v16qi {}
+
+ const vsq __builtin_vsx_revb_v1ti (vsq);
+ REVB_V1TI revb_v1ti {}
+
+ const vd __builtin_vsx_revb_v2df (vd);
+ REVB_V2DF revb_v2df {}
+
+ const vsll __builtin_vsx_revb_v2di (vsll);
+ REVB_V2DI revb_v2di {}
+
+ const vf __builtin_vsx_revb_v4sf (vf);
+ REVB_V4SF revb_v4sf {}
+
+ const vsi __builtin_vsx_revb_v4si (vsi);
+ REVB_V4SI revb_v4si {}
+
+ const vss __builtin_vsx_revb_v8hi (vss);
+ REVB_V8HI revb_v8hi {}
+
+ const vf __builtin_vsx_uns_float2_v2di (vsll, vsll);
+ UNS_FLOAT2_V2DI uns_float2_v2di {}
+
+ const vsi __builtin_vsx_vsigned2_v2df (vd, vd);
+ VEC_VSIGNED2_V2DF vsigned2_v2df {}
+
+ const vsi __builtin_vsx_vunsigned2_v2df (vd, vd);
+ VEC_VUNSIGNED2_V2DF vunsigned2_v2df {}
+
+ const vf __builtin_vsx_xscvdpspn (double);
+ XSCVDPSPN vsx_xscvdpspn {}
+
+ const double __builtin_vsx_xscvspdpn (vf);
+ XSCVSPDPN vsx_xscvspdpn {}
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a295ff5..164f586 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -771,6 +771,7 @@ Objective-C and Objective-C++ Dialects}.
-mverbose-cost-dump @gol
-mpure-code @gol
-mcmse @gol
+-mfix-cmse-cve-2021-35465 @gol
-mfdpic}
@emph{AVR Options}
@@ -14349,9 +14350,10 @@ The parameter only has an effect on targets that support partial
vector loads and stores.
@item vect-inner-loop-cost-factor
-The factor which the loop vectorizer applies to the cost of statements
-in an inner loop relative to the loop being vectorized. The default
-value is 50.
+The maximum factor which the loop vectorizer applies to the cost of statements
+in an inner loop relative to the loop being vectorized. The factor applied
+is the maximum of the estimated number of iterations of the inner loop and
+this parameter. The default value of this parameter is 50.
@item avoid-fma-max-bits
Maximum number of bits for which we avoid creating FMAs.
@@ -20701,6 +20703,14 @@ Generate secure code as per the "ARMv8-M Security Extensions: Requirements on
Development Tools Engineering Specification", which can be found on
@url{https://developer.arm.com/documentation/ecm0359818/latest/}.
+@item -mfix-cmse-cve-2021-35465
+@opindex mfix-cmse-cve-2021-35465
+Mitigate against a potential security issue with the @code{VLLDM} instruction
+in some M-profile devices when using CMSE (CVE-2021-35465). This option is
+enabled by default when the option @option{-mcpu=} is used with
+@code{cortex-m33}, @code{cortex-m35p} or @code{cortex-m55}. The option
+@option{-mno-fix-cmse-cve-2021-35465} can be used to disable the mitigation.
+
@item -mfdpic
@itemx -mno-fdpic
@opindex mfdpic
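
As a usage sketch (the target triple and file name here are illustrative placeholders): compiling a CMSE entry function with

    arm-none-eabi-gcc -mcpu=cortex-m33 -mcmse -c secure.c

enables the mitigation by default because cortex-m33 carries the quirk, while appending -mno-fix-cmse-cve-2021-35465 turns it back off.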
diff --git a/gcc/params.opt b/gcc/params.opt
index f926488..f414dc1 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1114,7 +1114,7 @@ Common Joined UInteger Var(param_vect_partial_vector_usage) Init(2) IntegerRange
Controls how loop vectorizer uses partial vectors. 0 means never, 1 means only for loops whose need to iterate can be removed, 2 means for all loops. The default value is 2.
-param=vect-inner-loop-cost-factor=
-Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 999999) Param Optimization
-The factor which the loop vectorizer applies to the cost of statements in an inner loop relative to the loop being vectorized.
+Common Joined UInteger Var(param_vect_inner_loop_cost_factor) Init(50) IntegerRange(1, 10000) Param Optimization
+The maximum factor which the loop vectorizer applies to the cost of statements in an inner loop relative to the loop being vectorized.
; This comment is to ensure we retain the blank line above.
diff --git a/gcc/testsuite/gcc.dg/predict-1.c b/gcc/testsuite/gcc.dg/predict-1.c
index 9e5605a..d2e753e 100644
--- a/gcc/testsuite/gcc.dg/predict-1.c
+++ b/gcc/testsuite/gcc.dg/predict-1.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -fdump-tree-profile_estimate" } */
+/* { dg-options "-O2 -fdump-tree-profile_estimate -fdisable-tree-evrp" } */
extern int global;
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c b/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c
new file mode 100644
index 0000000..8ee8e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/evrp-trans.c
@@ -0,0 +1,144 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-evrp" } */
+
+/* Simple tests to make sure transitive relations are working. */
+void keep();
+void kill();
+
+void
+f1 (int x, int y, int z)
+{
+ if (x > y)
+ if (y > z)
+ {
+ if (x > z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f2 (int w, int x, int y, int z)
+{
+ // Test one equivalence.
+ if (w == z)
+ if (x > y)
+ if (y > z)
+ {
+ if (x > w)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f3 (int a, int w, int x, int y, int z)
+{
+ // Test two equivalences.
+ if (a == x)
+ if (w == z)
+ if (x > y)
+ if (y > z)
+ {
+ if (a > w)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f4 (int x, int y, int z)
+{
+ // test X > Y >= Z
+ if (x > y)
+ if (y >= z)
+ {
+ if (x > z)
+ keep ();
+ else
+ kill ();
+ }
+}
+void
+f5 (int x, int y, int z)
+{
+ // test X >= Y > Z
+ if (x >= y)
+ if (y > z)
+ {
+ if (x > z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f6 (int x, int y, int z)
+{
+ // test X >= Y >= Z
+ if (x >= y)
+ if (y >= z)
+ {
+ if (x > z)
+ keep ();
+ else if (x == z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f7 (int x, int y, int z)
+{
+ // test Y <= X, Z <= Y
+ if (y <= x)
+ if (z <= y)
+ {
+ if (x > z)
+ keep ();
+ else if (x == z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f8 (int x, int y, int z)
+{
+ // test X >= Y, Z <= Y
+ if (x >= y)
+ if (z <= y)
+ {
+ if (x > z)
+ keep ();
+ else if (x == z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+void
+f9 (int x, int y, int z)
+{
+ // test Y <= X, Y >= Z
+ if (y <= x)
+ if (y >= z)
+ {
+ if (x > z)
+ keep ();
+ else if (x == z)
+ keep ();
+ else
+ kill ();
+ }
+}
+
+/* { dg-final { scan-tree-dump-not "kill" "evrp" } } */
+/* { dg-final { scan-tree-dump-times "keep" 13 "evrp"} } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c
new file mode 100644
index 0000000..553cc78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-13a.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */
+
+#include "../../../cmse-13.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r1, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[1,4-9\]|r10|fp|ip), ){9}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[1,4-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-not "vmsr" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c
new file mode 100644
index 0000000..ce02fde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-7a.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */
+
+#include "../../../cmse-7.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-not "vmsr" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c
new file mode 100644
index 0000000..75e1611
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/soft/cmse-8a.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=soft -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=soft" } } */
+
+#include "../../../cmse-8.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler-not "vmov" } } */
+/* { dg-final { scan-assembler-not "vmsr" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c
new file mode 100644
index 0000000..dad7266
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-7a.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16 -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */
+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
+
+#include "../../../cmse-7.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c
new file mode 100644
index 0000000..faa0448
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp-sp/cmse-8a.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16 -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */
+/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */
+
+#include "../../../cmse-8.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c
new file mode 100644
index 0000000..bceba44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-13a.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */
+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
+
+#include "../../../cmse-13.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[1,4-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r1, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[1,4-9\]|r10|fp|ip), ){9}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[1,4-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c
new file mode 100644
index 0000000..c74ebbd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-7a.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */
+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
+
+#include "../../../cmse-7.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[0-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r0, )?(r1, )?(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[0-9\]|r10|fp|ip), ){12}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[0-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c
new file mode 100644
index 0000000..ffb67a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/8_1m/softfp/cmse-8a.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16 -mfix-cmse-cve-2021-35465" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=*" } { "-mfloat-abi=softfp" } } */
+/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */
+
+#include "../../../cmse-8.x"
+
+/* Checks for saving and clearing prior to function call. */
+/* Shift on the same register as blxns. */
+/* { dg-final { scan-assembler "lsrs\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler "lsls\t(r\[2-9\]|r10|fp|ip), \\1, #1.*blxns\t\\1" } } */
+/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */
+/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */
+/* { dg-final { scan-assembler "push\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+/* { dg-final { scan-assembler "vlstm\tsp" } } */
+/* Check the right registers are cleared and none appears twice. */
+/* { dg-final { scan-assembler "clrm\t\{(r2, )?(r3, )?(r4, )?(r5, )?(r6, )?(r7, )?(r8, )?(r9, )?(r10, )?(fp, )?(ip, )?APSR\}" } } */
+/* Check that the right number of registers is cleared and thus only one
+ register is missing. */
+/* { dg-final { scan-assembler "clrm\t\{((r\[2-9\]|r10|fp|ip), ){10}APSR\}" } } */
+/* Check that no cleared register is used for blxns. */
+/* { dg-final { scan-assembler-not "clrm\t\{\[^\}\]\+(r\[2-9\]|r10|fp|ip),\[^\}\]\+\}.*blxns\t\\1" } } */
+/* Check for v8.1-m variant of erratum work-around. */
+/* { dg-final { scan-assembler "vscclrm\t\{vpr\}" } } */
+/* { dg-final { scan-assembler "vlldm\tsp" } } */
+/* { dg-final { scan-assembler "pop\t\{r4, r5, r6, r7, r8, r9, r10, fp\}" } } */
+
+/* Now we check that we use the correct intrinsic to call. */
+/* { dg-final { scan-assembler "blxns" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
index 78bf5d3..fbc3de0 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
@@ -1,7 +1,8 @@
/* PR target/95524 */
/* { dg-do compile } */
/* { dg-options "-O2 -mavx512bw" } */
-/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[^\n\]*%zmm" 1 } } */
typedef char v64qi __attribute__ ((vector_size (64)));
typedef unsigned char v64uqi __attribute__ ((vector_size (64)));
@@ -11,7 +12,6 @@ foo_ashiftrt_512 (v64qi a)
return a >> 2;
}
/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
-/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
__attribute__((noipa)) v64qi
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
index 77ace86..e5616d8 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
@@ -5,4 +5,3 @@
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */
-/* { dg-final { scan-assembler-not "vzeroupper" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index 80e9fdb..6d9cb91 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -5,7 +5,6 @@
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
index 35f2e96..9588249 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c
@@ -4,9 +4,7 @@
#include "pr100865-6a.c"
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
/* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
index ad267c4..3b20c68 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c
@@ -5,8 +5,6 @@
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */
+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-1.c b/gcc/testsuite/gcc.target/i386/pr101989-1.c
new file mode 100644
index 0000000..594093e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-1.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-times "vpternlog" 6 } } */
+/* { dg-final { scan-assembler-not "vpxor" } } */
+/* { dg-final { scan-assembler-not "vpor" } } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+
+#include<immintrin.h>
+__m256d
+__attribute__((noipa, target("avx512vl")))
+copysign2_pd(__m256d from, __m256d to) {
+ __m256i a = _mm256_castpd_si256(from);
+ __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+ /* (avx_signbit & from) | (~avx_signbit & to) */
+ return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+__attribute__((noipa, target("avx512vl")))
+foo (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+__attribute__ ((noipa, target("avx512vl")))
+foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return src3 & src2 ^ src1;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-2.c b/gcc/testsuite/gcc.target/i386/pr101989-2.c
new file mode 100644
index 0000000..9d9759a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-2.c
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+#include "pr101989-1.c"
+__m256d
+avx2_copysign2_pd (__m256d from, __m256d to) {
+ __m256i a = _mm256_castpd_si256(from);
+ __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+ /* (avx_signbit & from) | (~avx_signbit & to) */
+ return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+avx2_foo (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & ~src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo1 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & src1) | (src3 & ~src1);
+}
+
+__m256i
+avx2_foo2 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (src2 & src1) | (~src3 & src1);
+}
+
+__m256i
+avx2_foo3 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return (~src2 & src1) | (src3 & src1);
+}
+
+__m256i
+avx2_foo4 (__m256i src1, __m256i src2, __m256i src3)
+{
+ return src3 & src2 ^ src1;
+}
+
+
+void
+test_256 (void)
+{
+ union256i_q q1, q2, q3, res2, exp2;
+ union256d d1, d2, res1, exp1;
+ int i, sign = 1;
+
+ for (i = 0; i < 4; i++)
+ {
+ d1.a[i] = 12.34 * (i + 2000) * sign;
+ d2.a[i] = 56.78 * (i - 30) * sign;
+ q1.a[i] = 12 * (i + 2000) * sign;
+ q2.a[i] = 56 * (i - 30) * sign;
+ q3.a[i] = 90 * (i + 40) * sign;
+ res1.a[i] = DEFAULT_VALUE;
+ exp1.a[i] = DEFAULT_VALUE;
+ res2.a[i] = exp2.a[i] = -1;
+ sign = -sign;
+ }
+
+ exp1.x = avx2_copysign2_pd (d1.x, d2.x);
+ res1.x = copysign2_pd (d1.x, d2.x);
+ if (UNION_CHECK (256, d) (res1, exp1.a))
+ abort ();
+
+ exp2.x = avx2_foo1 (q1.x, q2.x, q3.x);
+ res2.x = foo1 (q1.x, q2.x, q3.x);
+ if (UNION_CHECK (256, i_q) (res2, exp2.a))
+ abort ();
+
+ exp2.x = avx2_foo2 (q1.x, q2.x, q3.x);
+ res2.x = foo2 (q1.x, q2.x, q3.x);
+ if (UNION_CHECK (256, i_q) (res2, exp2.a))
+ abort ();
+
+ exp2.x = avx2_foo3 (q1.x, q2.x, q3.x);
+ res2.x = foo3 (q1.x, q2.x, q3.x);
+ if (UNION_CHECK (256, i_q) (res2, exp2.a))
+ abort ();
+
+ exp2.x = avx2_foo4 (q1.x, q2.x, q3.x);
+ res2.x = foo4 (q1.x, q2.x, q3.x);
+ if (UNION_CHECK (256, i_q) (res2, exp2.a))
+ abort ();
+
+ exp2.x = avx2_foo (q1.x, q2.x, q3.x);
+ res2.x = foo (q1.x, q2.x, q3.x);
+ if (UNION_CHECK (256, i_q) (res2, exp2.a))
+ abort ();
+}
+
+static void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c b/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c
new file mode 100644
index 0000000..d03d192
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101989-broadcast-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpternlog" 4 } } */
+/* { dg-final { scan-assembler-times "\\\{1to4\\\}" 4 } } */
+#include<immintrin.h>
+extern long long C;
+__m256d
+copysign2_pd(__m256d from, __m256d to) {
+ __m256i a = _mm256_castpd_si256(from);
+ __m256d avx_signbit = _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_cmpeq_epi64(a, a), 63));
+ /* (avx_signbit & from) | (~avx_signbit & to) */
+ return _mm256_or_pd(_mm256_and_pd(avx_signbit, from), _mm256_andnot_pd(avx_signbit, to));
+}
+
+__m256i
+mask_pternlog (__m256i A, __m256i B, __mmask8 U)
+{
+  return _mm256_mask_ternarylogic_epi64 (A, U, B, _mm256_set1_epi64x (C), 202);
+}
+
+__m256i
+maskz_pternlog (__m256i A, __m256i B, __mmask8 U)
+{
+  return _mm256_maskz_ternarylogic_epi64 (U, A, B, _mm256_set1_epi64x (C), 202);
+}
+
+__m256i
+none_pternlog (__m256i A, __m256i B)
+{
+  return _mm256_ternarylogic_epi64 (A, B, _mm256_set1_epi64x (C), 202);
+}
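
[Editor's note: the 202 immediate is not arbitrary — the vpternlog imm8 is the truth table of the desired three-input boolean function, obtained by evaluating it on the operand patterns A=0xF0, B=0xCC, C=0xAA, which enumerate all eight (a,b,c) bit combinations exactly once. A self-contained sketch deriving it for the bitwise select used throughout these tests; the function name is ours:

#include <stdio.h>

/* Per-bit select A ? B : C -- the pattern the tests expect to become
   a single vpternlog.  */
static unsigned
ternlog_imm (unsigned a, unsigned b, unsigned c)
{
  return ((a & b) | (~a & c)) & 0xff;
}

int
main (void)
{
  /* 0xF0/0xCC/0xAA walk all 8 input combinations bitwise, so the
     result is the imm8 truth table itself.  */
  printf ("%u\n", ternlog_imm (0xf0, 0xcc, 0xaa));  /* prints 202 */
  return 0;
}
]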
diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c
new file mode 100644
index 0000000..6db3f57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102021.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=skylake-avx512" } */
+
+#include<immintrin.h>
+
+__m256i
+foo ()
+{
+ return _mm256_set1_epi16 (12);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vzeroupper" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 66ce48d..06f5b1e 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4878,15 +4878,16 @@ proc check_effective_target_arm_cmse_ok {} {
proc check_effective_target_arm_cmse_hw { } {
return [check_runtime arm_cmse_hw_available {
- int __attribute__ ((cmse_nonsecure_entry)) ns_func(void)
- {
- return 0;
- }
int main (void)
{
- return ns_func();
- }
- } "-mcmse -Wl,--section-start,.gnu.sgstubs=0x00400000"]
+ unsigned id_pfr1;
+ asm ("ldr\t%0, =0xe000ed44\n" \
+ "ldr\t%0, [%0]\n" \
+ "sg" : "=l" (id_pfr1));
+ /* Exit with code 0 iff security extension is available. */
+ return !(id_pfr1 & 0xf0);
+ }
+ } "-mcmse"]
}
# Return 1 if the target supports executing MVE instructions, 0
# otherwise.
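
[Editor's note: a rough standalone C model of what the inline asm above probes, assuming an Armv8-M target with the System Control Block at its architected address; only the field test is modelled, not the sg instruction:

#include <stdint.h>

/* ID_PFR1 lives at 0xe000ed44; bits [7:4] are the Security field and
   are non-zero when the security extension (CMSE) is implemented,
   which is why the check returns !(id_pfr1 & 0xf0).  */
static int
security_extension_present (void)
{
  volatile uint32_t *id_pfr1 = (volatile uint32_t *) 0xe000ed44;
  return (*id_pfr1 & 0xf0) != 0;
}
]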
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c521b43a..0c8d992 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1519,6 +1519,13 @@ vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
stmt_vec_info inner_loop_cond_info
= loop_vinfo->lookup_stmt (inner_loop_cond);
STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
+      /* If we have an estimate on the number of iterations of the inner
+	 loop, use that to limit the scale for costing; otherwise use
+	 --param vect-inner-loop-cost-factor literally.  */
+ widest_int nit;
+ if (estimated_stmt_executions (loop->inner, &nit))
+ LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
+ = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
}
gcc_assert (!loop->aux);
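
[Editor's note: in effect the new code clamps the costing scale; a minimal model of the computation, with names ours:

/* The inner-loop cost scale becomes the smaller of the iteration
   estimate and the --param vect-inner-loop-cost-factor value.  */
static unsigned
inner_cost_factor (unsigned estimated_nit, unsigned param_factor)
{
  return estimated_nit < param_factor ? estimated_nit : param_factor;
}
]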
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index d2f6a16..edc11c6 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -5233,7 +5233,8 @@ li_cost_vec_cmp (const void *a_, const void *b_)
static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
- vec<slp_instance> slp_instances)
+ vec<slp_instance> slp_instances,
+ loop_p orig_loop)
{
slp_instance instance;
int i;
@@ -5270,6 +5271,30 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
vector_costs.safe_splice (instance->cost_vec);
instance->cost_vec.release ();
}
+  /* When we're vectorizing an if-converted loop body with the
+     very-cheap cost model, make sure we vectorized all the
+     if-converted code.  */
+ bool force_not_profitable = false;
+ if (orig_loop && flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP)
+ {
+ gcc_assert (bb_vinfo->bbs.length () == 1);
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ /* The costing above left us with DCEable vectorized scalar
+ stmts having the visited flag set. */
+ if (gimple_visited_p (gsi_stmt (gsi)))
+ continue;
+
+ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+ if (gimple_assign_rhs_code (ass) == COND_EXPR)
+ {
+ force_not_profitable = true;
+ break;
+ }
+ }
+ }
+
/* Unset visited flag. */
stmt_info_for_cost *cost;
FOR_EACH_VEC_ELT (scalar_costs, i, cost)
@@ -5394,9 +5419,14 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
return false;
}
+ if (dump_enabled_p () && force_not_profitable)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "not profitable because of unprofitable if-converted "
+ "scalar code\n");
+
scalar_costs.release ();
vector_costs.release ();
- return true;
+ return !force_not_profitable;
}
/* qsort comparator for lane defs. */
@@ -5810,7 +5840,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
- vec<int> *dataref_groups, unsigned int n_stmts)
+ vec<int> *dataref_groups, unsigned int n_stmts,
+ loop_p orig_loop)
{
bb_vec_info bb_vinfo;
auto_vector_modes vector_modes;
@@ -5859,7 +5890,9 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
vect_location = instance->location ();
if (!unlimited_cost_model (NULL)
&& !vect_bb_vectorization_profitable_p
- (bb_vinfo, instance->subgraph_entries))
+ (bb_vinfo,
+ orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
+ : instance->subgraph_entries, orig_loop))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -5877,7 +5910,9 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
"using SLP\n");
vectorized = true;
- vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
+ vect_schedule_slp (bb_vinfo,
+ orig_loop ? BB_VINFO_SLP_INSTANCES (bb_vinfo)
+ : instance->subgraph_entries);
unsigned HOST_WIDE_INT bytes;
if (dump_enabled_p ())
@@ -5892,6 +5927,11 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
"basic block part vectorized using "
"variable length vectors\n");
}
+
+	  /* When we're called from loop vectorization, we're considering
+	     all subgraphs at once.  */
+ if (orig_loop)
+ break;
}
}
else
@@ -5959,7 +5999,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
true if anything in the basic-block was vectorized. */
static bool
-vect_slp_bbs (const vec<basic_block> &bbs)
+vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
{
vec<data_reference_p> datarefs = vNULL;
auto_vec<int> dataref_groups;
@@ -5989,18 +6029,20 @@ vect_slp_bbs (const vec<basic_block> &bbs)
++current_group;
}
- return vect_slp_region (bbs, datarefs, &dataref_groups, insns);
+ return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
}
-/* Main entry for the BB vectorizer. Analyze and transform BB, returns
- true if anything in the basic-block was vectorized. */
+/* Special entry for the BB vectorizer.  Analyze and transform a single
+   if-converted BB, with ORIG_LOOP's body being the non-if-converted
+   representation.  Returns true if anything in the basic-block was
+ vectorized. */
bool
-vect_slp_bb (basic_block bb)
+vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
{
auto_vec<basic_block> bbs;
bbs.safe_push (bb);
- return vect_slp_bbs (bbs);
+ return vect_slp_bbs (bbs, orig_loop);
}
/* Main entry for the BB vectorizer. Analyze and transform BB, returns
@@ -6051,7 +6093,7 @@ vect_slp_function (function *fun)
if (split && !bbs.is_empty ())
{
- r |= vect_slp_bbs (bbs);
+ r |= vect_slp_bbs (bbs, NULL);
bbs.truncate (0);
bbs.quick_push (bb);
}
@@ -6069,13 +6111,13 @@ vect_slp_function (function *fun)
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"splitting region at control altering "
"definition %G", last);
- r |= vect_slp_bbs (bbs);
+ r |= vect_slp_bbs (bbs, NULL);
bbs.truncate (0);
}
}
if (!bbs.is_empty ())
- r |= vect_slp_bbs (bbs);
+ r |= vect_slp_bbs (bbs, NULL);
free (rpo);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 813f468..3aa3e2a 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1033,10 +1033,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
only non-if-converted parts took part in BB vectorization. */
if (flag_tree_slp_vectorize != 0
&& loop_vectorized_call
- && ! loop->inner
- /* This would purely be a workaround and should be removed
- once PR100089 is fixed. */
- && flag_vect_cost_model != VECT_COST_MODEL_VERY_CHEAP)
+ && ! loop->inner)
{
basic_block bb = loop->header;
bool require_loop_vectorize = false;
@@ -1062,12 +1059,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
gimple_set_uid (stmt, -1);
gimple_set_visited (stmt, false);
}
- if (!require_loop_vectorize && vect_slp_bb (bb))
+ if (!require_loop_vectorize)
{
- fold_loop_internal_call (loop_vectorized_call,
- boolean_true_node);
- loop_vectorized_call = NULL;
- ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals;
+ tree arg = gimple_call_arg (loop_vectorized_call, 1);
+ class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
+ if (vect_slp_if_converted_bb (bb, scalar_loop))
+ {
+ fold_loop_internal_call (loop_vectorized_call,
+ boolean_true_node);
+ loop_vectorized_call = NULL;
+ ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals;
+ }
}
}
/* If outer loop vectorization fails for LOOP_VECTORIZED guarded
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9c2c29d..72e018e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *);
extern void vect_get_slp_defs (slp_tree, vec<tree> *);
extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *,
unsigned n = -1U);
-extern bool vect_slp_bb (basic_block);
+extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop);
extern bool vect_slp_function (function *);
extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree);
extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree);
diff --git a/gcc/value-relation.cc b/gcc/value-relation.cc
index bcfe388..8edd98b 100644
--- a/gcc/value-relation.cc
+++ b/gcc/value-relation.cc
@@ -112,7 +112,7 @@ relation_kind rr_intersect_table[VREL_COUNT][VREL_COUNT] = {
{ NE_EXPR, LT_EXPR, LT_EXPR, GT_EXPR, GT_EXPR, VREL_EMPTY, VREL_EMPTY, NE_EXPR } };
-// Intersect relation R! with relation R2 and return the resulting relation.
+// Intersect relation R1 with relation R2 and return the resulting relation.
relation_kind
relation_intersect (relation_kind r1, relation_kind r2)
@@ -155,6 +155,39 @@ relation_union (relation_kind r1, relation_kind r2)
}
+// This table is used to determine transitivity between 2 relations.
+// (A relation0 B) and (B relation1 C) imply (A result C)
+
+relation_kind rr_transitive_table[VREL_COUNT][VREL_COUNT] = {
+// NONE, LT_EXPR, LE_EXPR, GT_EXPR, GE_EXPR, EMPTY, EQ_EXPR, NE_EXPR
+// VREL_NONE
+ { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE },
+// LT_EXPR
+ { VREL_NONE, LT_EXPR, LT_EXPR, VREL_NONE, VREL_NONE, VREL_NONE, LT_EXPR, VREL_NONE },
+// LE_EXPR
+ { VREL_NONE, LT_EXPR, LE_EXPR, VREL_NONE, VREL_NONE, VREL_NONE, LE_EXPR, VREL_NONE },
+// GT_EXPR
+ { VREL_NONE, VREL_NONE, VREL_NONE, GT_EXPR, GT_EXPR, VREL_NONE, GT_EXPR, VREL_NONE },
+// GE_EXPR
+ { VREL_NONE, VREL_NONE, VREL_NONE, GT_EXPR, GE_EXPR, VREL_NONE, GE_EXPR, VREL_NONE },
+// VREL_EMPTY
+ { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE },
+// EQ_EXPR
+ { VREL_NONE, LT_EXPR, LE_EXPR, GT_EXPR, GE_EXPR, VREL_NONE, EQ_EXPR, VREL_NONE },
+// NE_EXPR
+ { VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE, VREL_NONE } };
+
+// Apply transitive operation between relation R1 and relation R2, and
+// return the resulting relation, if any.
+
+relation_kind
+relation_transitive (relation_kind r1, relation_kind r2)
+{
+ vrel_range_assert (r1);
+ vrel_range_assert (r2);
+ return rr_transitive_table[r1 - VREL_FIRST][r2 - VREL_FIRST];
+}
+
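
[Editor's note: a self-contained model of the ordered quadrant of this table may help; it reproduces the LT/LE/GT/GE entries (the EQ_EXPR row, which passes the other relation through, and the VREL rows are omitted; all names are ours):

#include <assert.h>

enum rel { NONE, LT, LE, GT, GE };

/* (A a B) and (B b C) imply (A result C): two relations chain only
   when they point the same way, and the result is strict if either
   input is strict.  */
static enum rel
transitive (enum rel a, enum rel b)
{
  if ((a == LT || a == LE) && (b == LT || b == LE))
    return (a == LT || b == LT) ? LT : LE;
  if ((a == GT || a == GE) && (b == GT || b == GE))
    return (a == GT || b == GT) ? GT : GE;
  return NONE;  /* e.g. A < B and B > C imply nothing */
}

int
main (void)
{
  assert (transitive (LT, LE) == LT);
  assert (transitive (GE, GE) == GE);
  assert (transitive (LT, GT) == NONE);
  return 0;
}
]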
// -------------------------------------------------------------------------
// This class represents an equivalency set, and contains a link to the next
@@ -472,7 +505,7 @@ public:
bool union_ (value_relation &p);
bool intersect (value_relation &p);
void negate ();
- void swap ();
+ bool apply_transitive (const value_relation &rel);
void dump (FILE *f) const;
private:
@@ -517,14 +550,6 @@ value_relation::negate ()
related = relation_negate (related);
}
-// Modify the relation as if the operands were being swapped.
-
-void
-value_relation::swap ()
-{
- related = relation_swap (related);
-}
-
// Perform an intersection between 2 relations. *this &&= p.
bool
@@ -561,6 +586,73 @@ value_relation::union_ (value_relation &p)
return old != related;
}
+// Identify and apply any transitive relations between REL
+// and THIS. Return true if there was a transformation.
+
+bool
+value_relation::apply_transitive (const value_relation &rel)
+{
+ relation_kind k = VREL_NONE;
+
+  // Identify any common operand, and normalize the relations to
+  // the form: A < B, B < C produces A < C.
+ if (rel.op1 () == name2)
+ {
+ // A < B B < C
+ if (rel.op2 () == name1)
+ return false;
+ k = relation_transitive (kind (), rel.kind ());
+ if (k != VREL_NONE)
+ {
+ related = k;
+ name2 = rel.op2 ();
+ return true;
+ }
+ }
+ else if (rel.op1 () == name1)
+ {
+ // B > A B < C
+ if (rel.op2 () == name2)
+ return false;
+ k = relation_transitive (relation_swap (kind ()), rel.kind ());
+ if (k != VREL_NONE)
+ {
+ related = k;
+ name1 = name2;
+ name2 = rel.op2 ();
+ return true;
+ }
+ }
+ else if (rel.op2 () == name2)
+ {
+ // A < B C > B
+ if (rel.op1 () == name1)
+ return false;
+ k = relation_transitive (kind (), relation_swap (rel.kind ()));
+ if (k != VREL_NONE)
+ {
+ related = k;
+ name2 = rel.op1 ();
+ return true;
+ }
+ }
+ else if (rel.op2 () == name1)
+ {
+ // B > A C > B
+ if (rel.op1 () == name2)
+ return false;
+ k = relation_transitive (relation_swap (kind ()),
+ relation_swap (rel.kind ()));
+ if (k != VREL_NONE)
+ {
+ related = k;
+ name1 = name2;
+ name2 = rel.op1 ();
+ return true;
+ }
+ }
+ return false;
+}
// Dump the relation to file F.
@@ -597,6 +689,7 @@ relation_oracle::relation_oracle ()
m_relations.safe_grow_cleared (last_basic_block_for_fn (cfun) + 1);
m_relation_set = BITMAP_ALLOC (&m_bitmaps);
m_tmp = BITMAP_ALLOC (&m_bitmaps);
+ m_tmp2 = BITMAP_ALLOC (&m_bitmaps);
}
// Destruct a relation oracle.
@@ -669,10 +762,12 @@ relation_oracle::register_relation (edge e, relation_kind k, tree op1,
// Register relation K between OP1 and OP2 in block BB.
// This creates the record and searches for existing records in the dominator
// tree to merge with.
+// TRANSITIVE_P is true if this is being registered as a transitive operation,
+// in which case we should not try to register further transitives.
void
relation_oracle::register_relation (basic_block bb, relation_kind k, tree op1,
- tree op2)
+ tree op2, bool transitive_p)
{
gcc_checking_assert (k != VREL_NONE);
@@ -710,26 +805,160 @@ relation_oracle::register_relation (basic_block bb, relation_kind k, tree op1,
ptr->dump (dump_file);
fprintf (dump_file, "\n");
}
- return;
+ }
+ else
+ {
+ // Check for an existing relation further up the DOM chain.
+      // By including dominating relations, the first one found in any search
+ // will be the aggregate of all the previous ones.
+ curr = find_relation_dom (bb, v1, v2);
+ if (curr != VREL_NONE)
+ k = relation_intersect (curr, k);
+
+ bitmap_set_bit (bm, v1);
+ bitmap_set_bit (bm, v2);
+ bitmap_set_bit (m_relation_set, v1);
+ bitmap_set_bit (m_relation_set, v2);
+
+ ptr = (relation_chain *) obstack_alloc (&m_chain_obstack,
+ sizeof (relation_chain));
+ ptr->set_relation (k, op1, op2);
+ ptr->m_next = m_relations[bbi].m_head;
+      m_relations[bbi].m_head = ptr;
}
- // Check for an existing relation further up the DOM chain.
- // By including dominating relations, The first one found in any search
- // will be the aggregate of all the previous ones.
- curr = find_relation_dom (bb, v1, v2);
- if (curr != VREL_NONE)
- k = relation_intersect (curr, k);
-
- bitmap_set_bit (bm, v1);
- bitmap_set_bit (bm, v2);
- bitmap_set_bit (m_relation_set, v1);
- bitmap_set_bit (m_relation_set, v2);
-
- ptr = (relation_chain *) obstack_alloc (&m_chain_obstack,
- sizeof (relation_chain));
- ptr->set_relation (k, op1, op2);
- ptr->m_next = m_relations[bbi].m_head;
- m_relations[bbi].m_head = ptr;;
+ if (!transitive_p)
+ register_transitives (bb, *ptr);
+}
+
+// Starting at ROOT_BB search the DOM tree looking for relations which
+// may produce transitive relations to RELATION. EQUIV1 and EQUIV2 are
+// bitmaps for op1/op2 and any of their equivalences that should also be
+// considered.
+
+void
+relation_oracle::register_transitives (basic_block root_bb,
+ const value_relation &relation,
+ const_bitmap equiv1,
+ const_bitmap equiv2)
+{
+ basic_block bb;
+ for (bb = root_bb; bb; bb = get_immediate_dominator (CDI_DOMINATORS, bb))
+ {
+ int bbi = bb->index;
+      if (bbi >= (int) m_relations.length ())
+ continue;
+ const_bitmap bm = m_relations[bbi].m_names;
+ if (!bm)
+ continue;
+ if (!bitmap_intersect_p (bm, equiv1) && !bitmap_intersect_p (bm, equiv2))
+ continue;
+ // At least one of the 2 ops has a relation in this block.
+ relation_chain *ptr;
+ for (ptr = m_relations[bbi].m_head; ptr ; ptr = ptr->m_next)
+ {
+	  // In the presence of an equivalence, the 2 operands may not
+	  // naturally match.  I.e. with equivalence a_2 == b_3,
+	  // given c_1 < a_2 && b_3 < d_4,
+	  // convert the second relation (b_3 < d_4) to match any
+	  // equivalences found in the first relation.
+	  // I.e. convert b_3 < d_4 to a_2 < d_4, which then exposes the
+	  // transitive operation: c_1 < a_2 && a_2 < d_4 -> c_1 < d_4.
+
+ tree r1, r2;
+ tree p1 = ptr->op1 ();
+ tree p2 = ptr->op2 ();
+ // Find which equivalence is in the first operand.
+ if (bitmap_bit_p (equiv1, SSA_NAME_VERSION (p1)))
+ r1 = p1;
+ else if (bitmap_bit_p (equiv1, SSA_NAME_VERSION (p2)))
+ r1 = p2;
+ else
+ r1 = NULL_TREE;
+
+ // Find which equivalence is in the second operand.
+ if (bitmap_bit_p (equiv2, SSA_NAME_VERSION (p1)))
+ r2 = p1;
+ else if (bitmap_bit_p (equiv2, SSA_NAME_VERSION (p2)))
+ r2 = p2;
+ else
+ r2 = NULL_TREE;
+
+	  // Ignore if both are NULL (no relevant relation) or the same.
+ if (r1 == r2)
+ continue;
+
+	  // For any operand not in an equivalence, just take the real operand.
+ if (!r1)
+ r1 = relation.op1 ();
+ if (!r2)
+ r2 = relation.op2 ();
+
+ value_relation nr (relation.kind (), r1, r2);
+ if (nr.apply_transitive (*ptr))
+ {
+ register_relation (root_bb, nr.kind (), nr.op1 (), nr.op2 (),
+ true);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, " Registering transitive relation ");
+ nr.dump (dump_file);
+ fputc ('\n', dump_file);
+ }
+ }
+
+ }
+ }
+}
+
+// Find and register any transitive relations implied by RELATION occurring
+// in block BB.
+
+void
+relation_oracle::register_transitives (basic_block bb,
+ const value_relation &relation)
+{
+ // Only apply transitives to certain kinds of operations.
+ switch (relation.kind ())
+ {
+ case LE_EXPR:
+ case LT_EXPR:
+ case GT_EXPR:
+ case GE_EXPR:
+ break;
+ default:
+ return;
+ }
+
+  // Set up the bitmaps for op1 and op2; if there are no equivalences,
+  // set just op1 or op2 in its own bitmap.
+ const_bitmap equiv1 = equiv_set (relation.op1 (), bb);
+ const_bitmap equiv2 = equiv_set (relation.op2 (), bb);
+ if (equiv1)
+ {
+ if (equiv2)
+ register_transitives (bb, relation, equiv1, equiv2);
+ else
+ {
+ bitmap_clear (m_tmp);
+ bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op2 ()));
+ register_transitives (bb, relation, equiv1, m_tmp);
+ }
+ }
+ else if (equiv2)
+ {
+ bitmap_clear (m_tmp);
+ bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op1 ()));
+ register_transitives (bb, relation, m_tmp, equiv2);
+ }
+ else
+ {
+ bitmap_clear (m_tmp);
+ bitmap_clear (m_tmp2);
+ bitmap_set_bit (m_tmp, SSA_NAME_VERSION (relation.op1 ()));
+ bitmap_set_bit (m_tmp2, SSA_NAME_VERSION (relation.op2 ()));
+ register_transitives (bb, relation, m_tmp, m_tmp2);
+ }
}
// Find the relation between any ssa_name in B1 and any name in B2 in block BB.
diff --git a/gcc/value-relation.h b/gcc/value-relation.h
index 1148854..e0e2f82 100644
--- a/gcc/value-relation.h
+++ b/gcc/value-relation.h
@@ -143,7 +143,7 @@ public:
void dump (FILE *f, basic_block bb) const;
void dump (FILE *f) const;
private:
- bitmap m_tmp;
+ bitmap m_tmp, m_tmp2;
bitmap m_relation_set; // Index by ssa-name. True if a relation exists
vec <relation_chain_head> m_relations; // Index by BB, list of relations.
relation_kind find_relation_block (unsigned bb, const_bitmap b1,
@@ -153,7 +153,12 @@ private:
relation_kind find_relation_block (int bb, unsigned v1, unsigned v2,
relation_chain **obj = NULL);
relation_kind find_relation_dom (basic_block bb, unsigned v1, unsigned v2);
- void register_relation (basic_block bb, relation_kind k, tree op1, tree op2);
+ void register_relation (basic_block bb, relation_kind k, tree op1, tree op2,
+ bool transitive_p = false);
+ void register_transitives (basic_block, const class value_relation &);
+ void register_transitives (basic_block, const value_relation &, const_bitmap,
+ const_bitmap);
+
};
#endif /* GCC_VALUE_RELATION_H */