author | Martin Liska <mliska@suse.cz> | 2021-08-05 19:50:30 +0200
---|---|---
committer | Martin Liska <mliska@suse.cz> | 2021-08-05 19:50:30 +0200
commit | f182597d273fe81ffae6dfece17fecadd01842f7 (patch)
tree | de04d1a91a0c59ca773707b395393d0ce12516e7 /gcc
parent | fc45f824a020dff1ec2ea68cef1d23345fb7d447 (diff)
parent | 4739344d36e6d24764cbedde44a3fff6edc70f6c (diff)
Merge branch 'master' into devel/sphinx
Diffstat (limited to 'gcc')
184 files changed, 6507 insertions, 752 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 87b0e27..d888dc5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,308 @@ +2021-08-04 David Malcolm <dmalcolm@redhat.com> + + PR analyzer/101570 + * Makefile.in (ANALYZER_OBJS): Add analyzer/region-model-asm.o. + +2021-08-04 H.J. Lu <hjl.tools@gmail.com> + + PR target/101742 + * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes + only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. + +2021-08-04 H.J. Lu <hjl.tools@gmail.com> + + PR target/101772 + * config/i386/i386-expand.c (ix86_expand_vector_move): Call + ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy + data with SSE register from one memory location to another. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * config/s390/s390.c (expand_perm_with_vpdi): New function. + (vectorize_vec_perm_const_1): Call expand_perm_with_vpdi. + * config/s390/vector.md (*vpdi1<mode>, @vpdi1<mode>): Enable a + parameterized expander. + (*vpdi4<mode>, @vpdi4<mode>): Likewise. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * config/s390/s390.c (MAX_VECT_LEN): Define macro. + (struct expand_vec_perm_d): Define struct. + (expand_perm_with_merge): New function. + (vectorize_vec_perm_const_1): New function. + (s390_vectorize_vec_perm_const): New function. + (TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * config/s390/vector.md (V_HW_64): Remove mode iterator. + (*vec_load_pair<mode>): Use V_HW_2 instead of V_HW_64. + * config/s390/vx-builtins.md + (vec_scatter_element<V_HW_2:mode>_SI): Use V_HW_2 instead of + V_HW_64. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * config/s390/s390.md (UNSPEC_VEC_PERMI): Remove constant + definition. + * config/s390/vector.md (*vpdi1<mode>, *vpdi4<mode>): New pattern + definitions. + * config/s390/vx-builtins.md (*vec_permi<mode>): Emit generic rtx + instead of an unspec. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * config/s390/s390-modes.def: Add more vector modes to support + concatenation of two vectors. + * config/s390/s390-protos.h (s390_expand_merge_perm_const): Add + prototype. + (s390_expand_merge): Likewise. + * config/s390/s390.c (s390_expand_merge_perm_const): New function. + (s390_expand_merge): New function. + * config/s390/s390.md (UNSPEC_VEC_MERGEH, UNSPEC_VEC_MERGEL): + Remove constant definitions. + * config/s390/vector.md (V_HW_2): Add mode iterators. + (VI_HW_4, V_HW_4): Rename VI_HW_4 to V_HW_4. + (vec_2x_nelts, vec_2x_wide): New mode attributes. + (*vmrhb, *vmrlb, *vmrhh, *vmrlh, *vmrhf, *vmrlf, *vmrhg, *vmrlg): + New pattern definitions. + (vec_widen_umult_lo_<mode>, vec_widen_umult_hi_<mode>) + (vec_widen_smult_lo_<mode>, vec_widen_smult_hi_<mode>) + (vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf, vec_unpacks_lo_v2df) + (vec_unpacks_hi_v2df): Adjust expanders to emit non-unspec RTX for + vec merge. + * config/s390/vx-builtins.md (V_HW_4): Remove mode iterator. Now + in vector.md. + (vec_mergeh<mode>, vec_mergel<mode>): Use s390_expand_merge to + emit vec merge pattern. + +2021-08-04 Jonathan Wright <jonathan.wright@arm.com> + + * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half): + Define. + (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of + vec_select high-half from being added into Neon multiply + cost. + * rtlanal.c (vec_series_highpart_p): Define. + * rtlanal.h (vec_series_highpart_p): Declare. 
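As a concrete sketch of the "vec_select high-half" case costed in the entry above (standard ACLE intrinsics; the function name is hypothetical): a widening multiply of two vector high halves maps to a single UMULL2 on AArch64, so the high-half selection should come for free in the cost model.

```c
#include <arm_neon.h>

/* Widening multiply of the high halves of two vectors.  AArch64 can emit
   a single UMULL2 for this, so the cost of the vec_select high-half
   should not be added on top of the Neon multiply cost.  */
uint32x4_t
mul_high_halves (uint16x8_t a, uint16x8_t b)
{
  return vmull_u16 (vget_high_u16 (a), vget_high_u16 (b));
}
```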
+ +2021-08-04 Jonathan Wright <jonathan.wright@arm.com> + + * config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt): + Define. + (aarch64_rtx_mult_cost): Traverse RTL tree to prevent + vec_select cost from being added into Neon multiply cost. + +2021-08-04 Richard Sandiford <richard.sandiford@arm.com> + + * tree-vect-loop.c (vect_better_loop_vinfo_p): Detect cases in + which old_loop_vinfo is an epilogue loop that handles a constant + number of iterations. + +2021-08-04 Richard Sandiford <richard.sandiford@arm.com> + + * tree-vect-loop.c (vect_analyze_loop): Print a dump message + when a reanalyzed loop fails to be cheaper than the current + main loop. + +2021-08-04 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64.c: Fix a typo. + +2021-08-04 Vincent Lefèvre <vincent-gcc@vinc17.net> + + PR gcov-profile/101773 + * gcov-io.c (gcov_close): Check the return code of fclose. + +2021-08-04 Bernd Edlinger <bernd.edlinger@hotmail.de> + + PR ada/101575 + * dwarf2out.c (dwarf2out_assembly_start): Emit a dummy + .file statement when needed. + +2021-08-04 Richard Biener <rguenther@suse.de> + + * tree-vect-data-refs.c (vect_check_gather_scatter): + Include widening conversions only when the result is + still handled by native gather or the current offset + size does not already match the data size. + Also succeed analysis in case there's no native support, + noted by an IFN_LAST ifn and a NULL decl. + (vect_analyze_data_refs): Always consider gathers. + * tree-vect-patterns.c (vect_recog_gather_scatter_pattern): + Test for no IFN gather rather than decl gather. + * tree-vect-stmts.c (vect_model_load_cost): Pass in the + gather-scatter info and cost emulated gathers accordingly. + (vect_truncate_gather_scatter_offset): Properly test for + no IFN gather. + (vect_use_strided_gather_scatters_p): Likewise. + (get_load_store_type): Handle emulated gathers and their + restrictions. + (vectorizable_load): Likewise. Emulate them by extracting + scalar offsets, doing scalar loads and a vector construct. + +2021-08-04 H.J. Lu <hjl.tools@gmail.com> + + PR target/101742 + * expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces + argument to set m_max_size. + (move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d. + (store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d. + (compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d. + +2021-08-04 Roger Sayle <roger@nextmovesoftware.com> + Marc Glisse <marc.glisse@inria.fr> + + * match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and + (X*C1)^(X*C2) as X*(C1+C2), and related variants, using + tree_nonzero_bits to ensure that operands are bit-wise disjoint. + +2021-08-04 Richard Biener <rguenther@suse.de> + + * tree-ssa-forwprop.c (pass_forwprop::execute): Split + out code to decompose vector loads ... + (optimize_vector_load): ... here. Generalize it to + handle intermediate widening and TARGET_MEM_REF loads + and apply it to loads with a supported vector mode as well. + +2021-08-04 Richard Biener <rguenther@suse.de> + + PR tree-optimization/101756 + * tree-vect-slp.c (vectorizable_bb_reduc_epilogue): Make sure + the result of the reduction epilogue is compatible with the original + scalar result. + +2021-08-04 liuhongt <hongtao.liu@intel.com> + + PR target/101743 + * config/i386/i386.md (peephole2): Refine predicate from + register_operand to general_reg_operand. + +2021-08-04 Aldy Hernandez <aldyh@redhat.com> + + * gimple-range-path.h (path_range_query::dump): Mark override.
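As a concrete sketch of the match.pd canonicalization in the Sayle/Glisse entry above (hypothetical function name; assumes tree_nonzero_bits can prove the two products bit-wise disjoint):

```c
/* After the masking, x*16 occupies bits 4-7 and x*1 occupies bits 0-3,
   so tree_nonzero_bits proves the operands disjoint and the IOR is
   equivalent to an addition: (x*16)|(x*1) canonicalizes to x*17.  */
unsigned
duplicate_nibble (unsigned x)
{
  x &= 0xf;
  return (x * 16) | x;
}
```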
+ +2021-08-04 Richard Biener <rguenther@suse.de> + + PR tree-optimization/101769 + * tree-tailcall.c (eliminate_tail_call): Add the created loop + for the first recursion and return it via the new output parameter. + (optimize_tail_call): Pass through new output param. + (tree_optimize_tail_calls_1): After creating all latches, + add the created loop to the loop tree. Do not mark loops for fixup. + +2021-08-04 Martin Liska <mliska@suse.cz> + + * doc/invoke.texi: Document threader-mode param. + +2021-08-04 liuhongt <hongtao.liu@intel.com> + + * config/i386/sse.md (cond_fma<mode>): New expander. + (cond_fms<mode>): Ditto. + (cond_fnma<mode>): Ditto. + (cond_fnms<mode>): Ditto. + +2021-08-03 Segher Boessenkool <segher@kernel.crashing.org> + + * config/rs6000/vsx.md (*vsx_le_perm_store_<mode>): Use && instead of &. + +2021-08-03 Segher Boessenkool <segher@kernel.crashing.org> + + * config/rs6000/constraints.md: Remove "e" from the list of available + constraint characters. + +2021-08-03 Eugene Rozenfeld <erozen@microsoft.com> + + PR gcov-profile/71672 + * auto-profile.c (afdo_indirect_call): Fix setup of the histogram value for indirect calls. + +2021-08-03 Paul A. Clarke <pc@us.ibm.com> + + * config/rs6000/smmintrin.h (_mm_minpos_epu16): New. + +2021-08-03 H.J. Lu <hjl.tools@gmail.com> + + * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode, + try XMM31 to avoid vzeroupper. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * doc/invoke.texi: Document -mtune=neoverse-512tvb and + -mcpu=neoverse-512tvb. + * config/aarch64/aarch64-cores.def (neoverse-512tvb): New entry. + * config/aarch64/aarch64-tune.md: Regenerate. + * config/aarch64/aarch64.c (neoverse512tvb_sve_vector_cost) + (neoverse512tvb_sve_issue_info, neoverse512tvb_vec_issue_info) + (neoverse512tvb_vector_cost, neoverse512tvb_tunings): New structures. + (aarch64_adjust_body_cost_sve): Handle -mtune=neoverse-512tvb. + (aarch64_adjust_body_cost): Likewise. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Only + record issue information for operations that occur in the + innermost loop. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64.c (aarch64_multiply_add_p): Add a vec_flags + parameter. Detect cases in which an Advanced SIMD MLA would almost + certainly require a MOV. + (aarch64_count_ops): Update accordingly. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction): New + function, split out from... + (aarch64_detect_vector_stmt_subtype): ...here. + (aarch64_add_stmt_cost): Treat extracting element 0 as free. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64-protos.h (sve_vec_cost): + Add gather_load_x32_cost and gather_load_x64_cost. + * config/aarch64/aarch64.c (generic_sve_vector_cost) + (a64fx_sve_vector_cost, neoversev1_sve_vector_cost): Update + accordingly, using the values given by the scalar_load * number + of elements calculation that we used previously. + (aarch64_detect_vector_stmt_subtype): Use the new fields. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New + function, split out from... + (aarch64_adjust_body_cost): ...here. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/fractional-cost.h: New file.
+ * config/aarch64/aarch64.c: Include <algorithm> (indirectly) + and fractional-cost.h. + (vec_cost_fraction): New typedef. + (aarch64_detect_scalar_stmt_subtype): Use it for statement costs. + (aarch64_detect_vector_stmt_subtype): Likewise. + (aarch64_sve_adjust_stmt_cost, aarch64_adjust_stmt_cost): Likewise. + (aarch64_estimate_min_cycles_per_iter): Use vec_cost_fraction + for cycle counts. + (aarch64_adjust_body_cost): Likewise. + (aarch64_test_cost_fraction): New function. + (aarch64_run_selftests): Call it. + +2021-08-03 Richard Sandiford <richard.sandiford@arm.com> + + * config/aarch64/aarch64-protos.h (tune_params::sve_width): Turn + into a bitmask. + * config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Update + accordingly. + (aarch64_estimated_poly_value): Likewise. Use the least significant + set bit for the minimum and likely values. Use the most significant + set bit for the maximum value. + +2021-08-03 liuhongt <hongtao.liu@intel.com> + + * config/i386/sse.md (cond_<insn><mode>): New expander. + (cond_mul<mode>): Ditto. + 2021-08-03 Kewen Lin <linkw@linux.ibm.com> * tree-cfg.c (move_sese_region_to_fn): Fix typos on dloop. diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index d34783f..6168f46 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20210803 +20210805 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 98eb479..c0f6e0a 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1262,6 +1262,7 @@ ANALYZER_OBJS = \ analyzer/program-state.o \ analyzer/region.o \ analyzer/region-model.o \ + analyzer/region-model-asm.o \ analyzer/region-model-impl-calls.o \ analyzer/region-model-manager.o \ analyzer/region-model-reachability.o \ diff --git a/gcc/analyzer/ChangeLog b/gcc/analyzer/ChangeLog index 2da5aae5..43e3c63 100644 --- a/gcc/analyzer/ChangeLog +++ b/gcc/analyzer/ChangeLog @@ -1,3 +1,45 @@ +2021-08-04 David Malcolm <dmalcolm@redhat.com> + + PR analyzer/101570 + * analyzer.cc (maybe_reconstruct_from_def_stmt): Add GIMPLE_ASM + case. + * analyzer.h (class asm_output_svalue): New forward decl. + (class reachable_regions): New forward decl. + * complexity.cc (complexity::from_vec_svalue): New. + * complexity.h (complexity::from_vec_svalue): New decl. + * engine.cc (feasibility_state::maybe_update_for_edge): Handle + asm stmts by calling on_asm_stmt. + * region-model-asm.cc: New file. + * region-model-manager.cc + (region_model_manager::maybe_fold_asm_output_svalue): New. + (region_model_manager::get_or_create_asm_output_svalue): New. + (region_model_manager::log_stats): Log m_asm_output_values_map. + * region-model.cc (region_model::on_stmt_pre): Handle GIMPLE_ASM. + * region-model.h (visitor::visit_asm_output_svalue): New. + (region_model_manager::get_or_create_asm_output_svalue): New decl. + (region_model_manager::maybe_fold_asm_output_svalue): New decl. + (region_model_manager::asm_output_values_map_t): New typedef. + (region_model_manager::m_asm_output_values_map): New field. + (region_model::on_asm_stmt): New. + * store.cc (binding_cluster::on_asm): New. + * store.h (binding_cluster::on_asm): New decl. + * svalue.cc (svalue::cmp_ptr): Handle SK_ASM_OUTPUT. + (asm_output_svalue::dump_to_pp): New. + (asm_output_svalue::dump_input): New. + (asm_output_svalue::input_idx_to_asm_idx): New. + (asm_output_svalue::accept): New. + * svalue.h (enum svalue_kind): Add SK_ASM_OUTPUT. + (svalue::dyn_cast_asm_output_svalue): New. + (class asm_output_svalue): New. + (is_a_helper <const asm_output_svalue *>::test): New. + (struct default_hash_traits<asm_output_svalue::key_t>): New.
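The kind of inline asm these analyzer entries target is quoted in full in region-model-asm.cc further down in this diff; as a self-contained sketch (adapted from that comment; x86-specific):

```c
/* A deterministic "asm volatile": the output is a pure function of the
   inputs, so the analyzer can hand back the same asm_output_svalue for
   two invocations with equal inputs, instead of a fresh conjured value
   each time.  */
static inline unsigned long
array_index_mask_nospec (unsigned long index, unsigned long size)
{
  unsigned long mask;
  asm volatile ("cmp %1,%2; sbb %0,%0;"
                : "=r" (mask)
                : "g" (size), "r" (index)
                : "cc");
  return mask;
}
```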
+ +2021-08-03 Jakub Jelinek <jakub@redhat.com> + + PR analyzer/101721 + * sm-malloc.cc (known_allocator_p): Only check DECL_FUNCTION_CODE on + BUILT_IN_NORMAL builtins. + 2021-07-29 Ankur Saini <arsenic@sourceware.org> * call-string.cc (call_string::element_t::operator==): New operator. diff --git a/gcc/analyzer/analyzer.cc b/gcc/analyzer/analyzer.cc index b845b86..5578877 100644 --- a/gcc/analyzer/analyzer.cc +++ b/gcc/analyzer/analyzer.cc @@ -131,6 +131,7 @@ maybe_reconstruct_from_def_stmt (tree ssa_name, { default: gcc_unreachable (); + case GIMPLE_ASM: case GIMPLE_NOP: case GIMPLE_PHI: /* Can't handle these. */ diff --git a/gcc/analyzer/analyzer.h b/gcc/analyzer/analyzer.h index 8de5d60..896b350 100644 --- a/gcc/analyzer/analyzer.h +++ b/gcc/analyzer/analyzer.h @@ -53,6 +53,7 @@ class svalue; class widening_svalue; class compound_svalue; class conjured_svalue; + class asm_output_svalue; typedef hash_set<const svalue *> svalue_set; class region; class frame_region; @@ -77,6 +78,7 @@ class call_details; struct rejected_constraint; class constraint_manager; class equiv_class; +class reachable_regions; class pending_diagnostic; class state_change_event; diff --git a/gcc/analyzer/complexity.cc b/gcc/analyzer/complexity.cc index ece4272..ae9f982 100644 --- a/gcc/analyzer/complexity.cc +++ b/gcc/analyzer/complexity.cc @@ -90,6 +90,22 @@ complexity::from_pair (const complexity &c1, const complexity &c2) MAX (c1.m_max_depth, c2.m_max_depth) + 1); } +/* Get complexity for a new node that references the svalues in VEC. */ + +complexity +complexity::from_vec_svalue (const vec<const svalue *> &vec) +{ + unsigned num_nodes = 0; + unsigned max_depth = 0; + for (auto iter_sval : vec) + { + const complexity &iter_c = iter_sval->get_complexity (); + num_nodes += iter_c.m_num_nodes; + max_depth = MAX (max_depth, iter_c.m_max_depth); + } + return complexity (num_nodes + 1, max_depth + 1); +} + } // namespace ana #endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/complexity.h b/gcc/analyzer/complexity.h index 459987e..85c0372 100644 --- a/gcc/analyzer/complexity.h +++ b/gcc/analyzer/complexity.h @@ -36,6 +36,7 @@ struct complexity complexity (const region *reg); complexity (const svalue *sval); static complexity from_pair (const complexity &c1, const complexity &c); + static complexity from_vec_svalue (const vec<const svalue *> &vec); /* The total number of svalues and regions in the tree of this entity, including the entity itself. */ diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc index ee625fb..ecd4265 100644 --- a/gcc/analyzer/engine.cc +++ b/gcc/analyzer/engine.cc @@ -3718,6 +3718,8 @@ feasibility_state::maybe_update_for_edge (logger *logger, if (const gassign *assign = dyn_cast <const gassign *> (stmt)) m_model.on_assignment (assign, NULL); + else if (const gasm *asm_stmt = dyn_cast <const gasm *> (stmt)) + m_model.on_asm_stmt (asm_stmt, NULL); else if (const gcall *call = dyn_cast <const gcall *> (stmt)) { bool terminate_path; diff --git a/gcc/analyzer/region-model-asm.cc b/gcc/analyzer/region-model-asm.cc new file mode 100644 index 0000000..3efc3fd --- /dev/null +++ b/gcc/analyzer/region-model-asm.cc @@ -0,0 +1,303 @@ +/* Handling inline asm in the analyzer. + Copyright (C) 2021 Free Software Foundation, Inc. + Contributed by David Malcolm <dmalcolm@redhat.com>. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tree.h" +#include "function.h" +#include "basic-block.h" +#include "gimple.h" +#include "gimple-iterator.h" +#include "diagnostic-core.h" +#include "pretty-print.h" +#include "tristate.h" +#include "selftest.h" +#include "json.h" +#include "analyzer/analyzer.h" +#include "analyzer/analyzer-logging.h" +#include "options.h" +#include "analyzer/call-string.h" +#include "analyzer/program-point.h" +#include "analyzer/store.h" +#include "analyzer/region-model.h" +#include "analyzer/region-model-reachability.h" +#include "stmt.h" + +#if ENABLE_ANALYZER + +namespace ana { + +/* Minimal asm support for the analyzer. + + The objective of this code is to: + - minimize false positives from the analyzer on the Linux kernel + (which makes heavy use of inline asm), whilst + - avoiding having to "teach" the compiler anything about specific strings + in asm statements. + + Specifically, we want to: + + (a) mark asm outputs and certain other regions as having been written to, + to avoid false positives from -Wanalyzer-use-of-uninitialized-value. + + (b) identify some of these stmts as "deterministic" so that we can + write consistent outputs given consistent inputs, and thus + avoid false positives for paths in which an asm is invoked twice + with the same inputs and is expected to emit the same output. + + This file implements heuristics for achieving the above. */ + +/* Determine if ASM_STMT is deterministic, in the sense of (b) above. + + Consider this x86 function taken from the Linux kernel + (arch/x86/include/asm/barrier.h): + + static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) + { + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; + } + + The above is a mitigation for Spectre-variant-1 attacks, for clamping + an array access to within the range of [0, size] if the CPU speculates + past the array bounds. + + However, it is ultimately used to implement wdev_to_wvif: + + static inline struct wfx_vif * + wdev_to_wvif(struct wfx_dev *wdev, int vif_id) + { + vif_id = array_index_nospec(vif_id, ARRAY_SIZE(wdev->vif)); + if (!wdev->vif[vif_id]) { + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; + } + + which is used by: + + if (wdev_to_wvif(wvif->wdev, 1)) + return wdev_to_wvif(wvif->wdev, 1)->vif; + + The code has been written to assume that wdev_to_wvif is deterministic, + and won't change from returning non-NULL at the "if" clause to + returning NULL at the "->vif" dereference. + + By treating the above specific "asm volatile" as deterministic we avoid + a false positive from -Wanalyzer-null-dereference. */ + +static bool +deterministic_p (const gasm *asm_stmt) +{ + /* Assume something volatile with no inputs is querying + changeable state e.g. rdtsc.
*/ + if (gimple_asm_ninputs (asm_stmt) == 0 + && gimple_asm_volatile_p (asm_stmt)) + return false; + + /* Otherwise assume it's purely a function of its inputs. */ + return true; +} + +/* Update this model for the asm STMT, using CTXT to report any + diagnostics. + + Compare with cfgexpand.c: expand_asm_stmt. */ + +void +region_model::on_asm_stmt (const gasm *stmt, region_model_context *ctxt) +{ + logger *logger = ctxt ? ctxt->get_logger () : NULL; + LOG_SCOPE (logger); + + const unsigned noutputs = gimple_asm_noutputs (stmt); + const unsigned ninputs = gimple_asm_ninputs (stmt); + + auto_vec<tree> output_tvec; + auto_vec<tree> input_tvec; + auto_vec<const char *> constraints; + + /* Copy the gimple vectors into new vectors that we can manipulate. */ + output_tvec.safe_grow (noutputs, true); + input_tvec.safe_grow (ninputs, true); + constraints.safe_grow (noutputs + ninputs, true); + + for (unsigned i = 0; i < noutputs; ++i) + { + tree t = gimple_asm_output_op (stmt, i); + output_tvec[i] = TREE_VALUE (t); + constraints[i] = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); + } + for (unsigned i = 0; i < ninputs; i++) + { + tree t = gimple_asm_input_op (stmt, i); + input_tvec[i] = TREE_VALUE (t); + constraints[i + noutputs] + = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); + } + + /* Determine which regions are reachable from the inputs + to this stmt. */ + reachable_regions reachable_regs (this); + + int num_errors = 0; + + auto_vec<const region *> output_regions (noutputs); + for (unsigned i = 0; i < noutputs; ++i) + { + tree val = output_tvec[i]; + const char *constraint; + bool is_inout; + bool allows_reg; + bool allows_mem; + + const region *dst_reg = get_lvalue (val, ctxt); + output_regions.quick_push (dst_reg); + reachable_regs.add (dst_reg, true); + + /* Try to parse the output constraint. If that fails, there's + no point in going further. */ + constraint = constraints[i]; + if (!parse_output_constraint (&constraint, i, ninputs, noutputs, + &allows_mem, &allows_reg, &is_inout)) + { + if (logger) + logger->log ("error parsing constraint for output %i: %qs", + i, constraint); + num_errors++; + continue; + } + + if (logger) + { + logger->log ("output %i: %qs %qE" + " is_inout: %i allows_reg: %i allows_mem: %i", + i, constraint, val, + (int)is_inout, (int)allows_reg, (int)allows_mem); + logger->start_log_line (); + logger->log_partial (" region: "); + dst_reg->dump_to_pp (logger->get_printer (), true); + logger->end_log_line (); + } + + } + + /* Ideally should combine with inout_svals to determine the + "effective inputs" and use this for the asm_output_svalue. */ + + auto_vec<const svalue *> input_svals (ninputs); + for (unsigned i = 0; i < ninputs; i++) + { + tree val = input_tvec[i]; + const char *constraint = constraints[i + noutputs]; + bool allows_reg, allows_mem; + if (! 
parse_input_constraint (&constraint, i, ninputs, noutputs, 0, + constraints.address (), + &allows_mem, &allows_reg)) + { + if (logger) + logger->log ("error parsing constraint for input %i: %qs", + i, constraint); + num_errors++; + continue; + } + + tree src_expr = input_tvec[i]; + const svalue *src_sval = get_rvalue (src_expr, ctxt); + check_for_poison (src_sval, src_expr, ctxt); + input_svals.quick_push (src_sval); + reachable_regs.handle_sval (src_sval); + + if (logger) + { + logger->log ("input %i: %qs %qE" + " allows_reg: %i allows_mem: %i", + i, constraint, val, + (int)allows_reg, (int)allows_mem); + logger->start_log_line (); + logger->log_partial (" sval: "); + src_sval->dump_to_pp (logger->get_printer (), true); + logger->end_log_line (); + } + } + + if (num_errors > 0) + gcc_unreachable (); + + if (logger) + { + logger->log ("reachability: "); + reachable_regs.dump_to_pp (logger->get_printer ()); + logger->end_log_line (); + } + + /* Given the regions that were reachable from the inputs, we + want to clobber them. + This is similar to region_model::handle_unrecognized_call, + but the unknown call policies seem too aggressive (e.g. purging state + from anything that's ever escaped). Instead, clobber any clusters + that were reachable in *this* asm stmt, rather than those that + escaped, and don't treat the values as having escaped. + We also assume that asm stmts don't affect sm-state. */ + for (auto iter = reachable_regs.begin_mutable_base_regs (); + iter != reachable_regs.end_mutable_base_regs (); ++iter) + { + const region *base_reg = *iter; + if (base_reg->symbolic_for_unknown_ptr_p ()) + continue; + + binding_cluster *cluster = m_store.get_or_create_cluster (base_reg); + cluster->on_asm (stmt, m_mgr->get_store_manager ()); + } + + /* Update the outputs. */ + for (unsigned output_idx = 0; output_idx < noutputs; output_idx++) + { + tree dst_expr = output_tvec[output_idx]; + const region *dst_reg = output_regions[output_idx]; + + const svalue *sval; + if (deterministic_p (stmt) + && input_svals.length () <= asm_output_svalue::MAX_INPUTS) + sval = m_mgr->get_or_create_asm_output_svalue (TREE_TYPE (dst_expr), + stmt, + output_idx, + input_svals); + else + { + sval = m_mgr->get_or_create_conjured_svalue (TREE_TYPE (dst_expr), + stmt, + dst_reg); + purge_state_involving (sval, ctxt); + } + set_value (dst_reg, sval, ctxt); + } +} + +} // namespace ana + +#endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/region-model-manager.cc b/gcc/analyzer/region-model-manager.cc index 14c57d8..9e4644f 100644 --- a/gcc/analyzer/region-model-manager.cc +++ b/gcc/analyzer/region-model-manager.cc @@ -1087,6 +1087,51 @@ region_model_manager::get_or_create_conjured_svalue (tree type, return conjured_sval; } +/* Subroutine of region_model_manager::get_or_create_asm_output_svalue. + Return a folded svalue, or NULL. */ + +const svalue * +region_model_manager:: +maybe_fold_asm_output_svalue (tree type, + const vec<const svalue *> &inputs) +{ + /* Unknown inputs should lead to unknown results. */ + for (const auto &iter : inputs) + if (iter->get_kind () == SK_UNKNOWN) + return get_or_create_unknown_svalue (type); + + return NULL; +} + +/* Return the svalue * of type TYPE for OUTPUT_IDX of the deterministic + asm stmt ASM_STMT, given INPUTS as inputs.
*/ + +const svalue * +region_model_manager:: +get_or_create_asm_output_svalue (tree type, + const gasm *asm_stmt, + unsigned output_idx, + const vec<const svalue *> &inputs) +{ + gcc_assert (inputs.length () <= asm_output_svalue::MAX_INPUTS); + + if (const svalue *folded + = maybe_fold_asm_output_svalue (type, inputs)) + return folded; + + const char *asm_string = gimple_asm_string (asm_stmt); + const unsigned noutputs = gimple_asm_noutputs (asm_stmt); + + asm_output_svalue::key_t key (type, asm_string, output_idx, inputs); + if (asm_output_svalue **slot = m_asm_output_values_map.get (key)) + return *slot; + asm_output_svalue *asm_output_sval + = new asm_output_svalue (type, asm_string, output_idx, noutputs, inputs); + RETURN_UNKNOWN_IF_TOO_COMPLEX (asm_output_sval); + m_asm_output_values_map.put (key, asm_output_sval); + return asm_output_sval; +} + /* Given STRING_CST, a STRING_CST and BYTE_OFFSET_CST a constant, attempt to get the character at that offset, returning either the svalue for the character constant, or NULL if unsuccessful. */ @@ -1505,6 +1550,9 @@ region_model_manager::log_stats (logger *logger, bool show_objs) const log_uniq_map (logger, show_objs, "widening_svalue", m_widening_values_map); log_uniq_map (logger, show_objs, "compound_svalue", m_compound_values_map); log_uniq_map (logger, show_objs, "conjured_svalue", m_conjured_values_map); + log_uniq_map (logger, show_objs, "asm_output_svalue", + m_asm_output_values_map); + logger->log ("max accepted svalue num_nodes: %i", m_max_complexity.m_num_nodes); logger->log ("max accepted svalue max_depth: %i", diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc index 1bc411b..58da7e3 100644 --- a/gcc/analyzer/region-model.cc +++ b/gcc/analyzer/region-model.cc @@ -980,7 +980,10 @@ region_model::on_stmt_pre (const gimple *stmt, break; case GIMPLE_ASM: - /* No-op for now. */ + { + const gasm *asm_stmt = as_a <const gasm *> (stmt); + on_asm_stmt (asm_stmt, ctxt); + } break; case GIMPLE_CALL: diff --git a/gcc/analyzer/region-model.h b/gcc/analyzer/region-model.h index d07ce9c..30f02a0 100644 --- a/gcc/analyzer/region-model.h +++ b/gcc/analyzer/region-model.h @@ -221,6 +221,7 @@ public: virtual void visit_widening_svalue (const widening_svalue *) {} virtual void visit_compound_svalue (const compound_svalue *) {} virtual void visit_conjured_svalue (const conjured_svalue *) {} + virtual void visit_asm_output_svalue (const asm_output_svalue *) {} virtual void visit_region (const region *) {} }; @@ -274,6 +275,11 @@ public: const binding_map &map); const svalue *get_or_create_conjured_svalue (tree type, const gimple *stmt, const region *id_reg); + const svalue * + get_or_create_asm_output_svalue (tree type, + const gasm *asm_stmt, + unsigned output_idx, + const vec<const svalue *> &inputs); const svalue *maybe_get_char_from_string_cst (tree string_cst, tree byte_offset_cst); @@ -346,6 +352,8 @@ private: const svalue *maybe_undo_optimize_bit_field_compare (tree type, const compound_svalue *compound_sval, tree cst, const svalue *arg1); + const svalue *maybe_fold_asm_output_svalue (tree type, + const vec<const svalue *> &inputs); unsigned m_next_region_id; root_region m_root_region; @@ -410,6 +418,10 @@ private: conjured_svalue *> conjured_values_map_t; conjured_values_map_t m_conjured_values_map; + typedef hash_map<asm_output_svalue::key_t, + asm_output_svalue *> asm_output_values_map_t; + asm_output_values_map_t m_asm_output_values_map; + bool m_check_complexity; /* Maximum complexity of svalues that weren't rejected. 
*/ @@ -537,6 +549,7 @@ class region_model void on_assignment (const gassign *stmt, region_model_context *ctxt); const svalue *get_gassign_result (const gassign *assign, region_model_context *ctxt); + void on_asm_stmt (const gasm *asm_stmt, region_model_context *ctxt); bool on_call_pre (const gcall *stmt, region_model_context *ctxt, bool *out_terminate_path); void on_call_post (const gcall *stmt, diff --git a/gcc/analyzer/store.cc b/gcc/analyzer/store.cc index 8ee414d..eac1295 100644 --- a/gcc/analyzer/store.cc +++ b/gcc/analyzer/store.cc @@ -1796,6 +1796,23 @@ binding_cluster::on_unknown_fncall (const gcall *call, } } +/* Mark this cluster as having been clobbered by STMT. */ + +void +binding_cluster::on_asm (const gasm *stmt, + store_manager *mgr) +{ + m_map.empty (); + + /* Bind it to a new "conjured" value using STMT. */ + const svalue *sval + = mgr->get_svalue_manager ()->get_or_create_conjured_svalue + (m_base_region->get_type (), stmt, m_base_region); + bind (mgr, m_base_region, sval); + + m_touched = true; +} + /* Return true if this binding_cluster has no information i.e. if there are no bindings, and it hasn't been marked as having escaped, or touched symbolically. */ diff --git a/gcc/analyzer/store.h b/gcc/analyzer/store.h index bc58694..b75691e 100644 --- a/gcc/analyzer/store.h +++ b/gcc/analyzer/store.h @@ -603,6 +603,7 @@ public: void mark_as_escaped (); void on_unknown_fncall (const gcall *call, store_manager *mgr); + void on_asm (const gasm *stmt, store_manager *mgr); bool escaped_p () const { return m_escaped; } bool touched_p () const { return m_touched; } diff --git a/gcc/analyzer/svalue.cc b/gcc/analyzer/svalue.cc index a1e6f50..6913161 100644 --- a/gcc/analyzer/svalue.cc +++ b/gcc/analyzer/svalue.cc @@ -508,6 +508,29 @@ svalue::cmp_ptr (const svalue *sval1, const svalue *sval2) conjured_sval2->get_id_region ()); } break; + case SK_ASM_OUTPUT: + { + const asm_output_svalue *asm_output_sval1 + = (const asm_output_svalue *)sval1; + const asm_output_svalue *asm_output_sval2 + = (const asm_output_svalue *)sval2; + if (int asm_string_cmp = strcmp (asm_output_sval1->get_asm_string (), + asm_output_sval2->get_asm_string ())) + return asm_string_cmp; + if (int output_idx_cmp = ((int)asm_output_sval1->get_output_idx () + - (int)asm_output_sval2->get_output_idx ())) + return output_idx_cmp; + if (int cmp = ((int)asm_output_sval1->get_num_inputs () + - (int)asm_output_sval2->get_num_inputs ())) + return cmp; + for (unsigned i = 0; i < asm_output_sval1->get_num_inputs (); i++) + if (int input_cmp + = svalue::cmp_ptr (asm_output_sval1->get_input (i), + asm_output_sval2->get_input (i))) + return input_cmp; + return 0; + } + break; } } @@ -1794,6 +1817,72 @@ conjured_svalue::accept (visitor *v) const m_id_reg->accept (v); } +/* class asm_output_svalue : public svalue. */ + +/* Implementation of svalue::dump_to_pp vfunc for asm_output_svalue.
*/ + +void +asm_output_svalue::dump_to_pp (pretty_printer *pp, bool simple) const +{ + if (simple) + { + pp_printf (pp, "ASM_OUTPUT(%qs, %%%i, {", + get_asm_string (), + get_output_idx ()); + for (unsigned i = 0; i < m_num_inputs; i++) + { + if (i > 0) + pp_string (pp, ", "); + dump_input (pp, i, m_input_arr[i], simple); + } + pp_string (pp, "})"); + } + else + { + pp_printf (pp, "asm_output_svalue (%qs, %%%i, {", + get_asm_string (), + get_output_idx ()); + for (unsigned i = 0; i < m_num_inputs; i++) + { + if (i > 0) + pp_string (pp, ", "); + dump_input (pp, i, m_input_arr[i], simple); + } + pp_string (pp, "})"); + } +} + +/* Subroutine of asm_output_svalue::dump_to_pp. */ + +void +asm_output_svalue::dump_input (pretty_printer *pp, + unsigned input_idx, + const svalue *sval, + bool simple) const +{ + pp_printf (pp, "%%%i: ", input_idx_to_asm_idx (input_idx)); + sval->dump_to_pp (pp, simple); +} + +/* Convert INPUT_IDX from an index into the array of inputs + into an index into all operands of the asm stmt. */ + +unsigned +asm_output_svalue::input_idx_to_asm_idx (unsigned input_idx) const +{ + return input_idx + m_num_outputs; +} + +/* Implementation of svalue::accept vfunc for asm_output_svalue. */ + +void +asm_output_svalue::accept (visitor *v) const +{ + v->visit_asm_output_svalue (this); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i]->accept (v); +} + } // namespace ana + #endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/svalue.h b/gcc/analyzer/svalue.h index debe439..63f7d15 100644 --- a/gcc/analyzer/svalue.h +++ b/gcc/analyzer/svalue.h @@ -47,7 +47,8 @@ enum svalue_kind SK_PLACEHOLDER, SK_WIDENING, SK_COMPOUND, - SK_CONJURED + SK_CONJURED, + SK_ASM_OUTPUT }; /* svalue and its subclasses. @@ -74,7 +75,9 @@ enum svalue_kind widening_svalue (SK_WIDENING): a merger of two svalues (possibly in an iteration). compound_svalue (SK_COMPOUND): a mapping of bit-ranges to svalues - conjured_svalue (SK_CONJURED): a value arising from a stmt. */ + conjured_svalue (SK_CONJURED): a value arising from a stmt + asm_output_svalue (SK_ASM_OUTPUT): an output from a deterministic + asm stmt. */ /* An abstract base class representing a value held by a region of memory. */ @@ -124,6 +127,8 @@ public: dyn_cast_compound_svalue () const { return NULL; } virtual const conjured_svalue * dyn_cast_conjured_svalue () const { return NULL; } + virtual const asm_output_svalue * + dyn_cast_asm_output_svalue () const { return NULL; } tree maybe_get_constant () const; const region *maybe_get_region () const; @@ -1394,4 +1399,140 @@ template <> struct default_hash_traits<conjured_svalue::key_t> static const bool empty_zero_p = true; }; +namespace ana { + +/* An output from a deterministic asm stmt, where we want to identify a + particular unknown value, rather than resorting to the unknown_value + singleton. + + Comparisons of variables that share the same asm_output_svalue are known + to be equal, even if we don't know what the value is. */ + +class asm_output_svalue : public svalue +{ +public: + /* Imposing an upper limit and using a (small) array allows key_t + to avoid memory management. */ + static const unsigned MAX_INPUTS = 2; + + /* A support class for uniquifying instances of asm_output_svalue.
*/ + struct key_t + { + key_t (tree type, + const char *asm_string, + unsigned output_idx, + const vec<const svalue *> &inputs) + : m_type (type), m_asm_string (asm_string), m_output_idx (output_idx), + m_num_inputs (inputs.length ()) + { + gcc_assert (inputs.length () <= MAX_INPUTS); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i] = inputs[i]; + } + + hashval_t hash () const + { + inchash::hash hstate; + hstate.add_ptr (m_type); + /* We don't bother hashing m_asm_str. */ + hstate.add_int (m_output_idx); + for (unsigned i = 0; i < m_num_inputs; i++) + hstate.add_ptr (m_input_arr[i]); + return hstate.end (); + } + + bool operator== (const key_t &other) const + { + if (!(m_type == other.m_type + && 0 == (strcmp (m_asm_string, other.m_asm_string)) + && m_output_idx == other.m_output_idx + && m_num_inputs == other.m_num_inputs)) + return false; + for (unsigned i = 0; i < m_num_inputs; i++) + if (m_input_arr[i] != other.m_input_arr[i]) + return false; + return true; + } + + /* Use m_asm_string to mark empty/deleted, as m_type can be NULL for + legitimate instances. */ + void mark_deleted () { m_asm_string = reinterpret_cast<const char *> (1); } + void mark_empty () { m_asm_string = NULL; } + bool is_deleted () const + { + return m_asm_string == reinterpret_cast<const char *> (1); + } + bool is_empty () const { return m_asm_string == NULL; } + + tree m_type; + const char *m_asm_string; + unsigned m_output_idx; + unsigned m_num_inputs; + const svalue *m_input_arr[MAX_INPUTS]; + }; + + asm_output_svalue (tree type, + const char *asm_string, + unsigned output_idx, + unsigned num_outputs, + const vec<const svalue *> &inputs) + : svalue (complexity::from_vec_svalue (inputs), type), + m_asm_string (asm_string), + m_output_idx (output_idx), + m_num_outputs (num_outputs), + m_num_inputs (inputs.length ()) + { + gcc_assert (inputs.length () <= MAX_INPUTS); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i] = inputs[i]; + } + + enum svalue_kind get_kind () const FINAL OVERRIDE { return SK_ASM_OUTPUT; } + const asm_output_svalue * + dyn_cast_asm_output_svalue () const FINAL OVERRIDE + { + return this; + } + + void dump_to_pp (pretty_printer *pp, bool simple) const FINAL OVERRIDE; + void accept (visitor *v) const FINAL OVERRIDE; + + const char *get_asm_string () const { return m_asm_string; } + unsigned get_output_idx () const { return m_output_idx; } + unsigned get_num_inputs () const { return m_num_inputs; } + const svalue *get_input (unsigned idx) const { return m_input_arr[idx]; } + + private: + void dump_input (pretty_printer *pp, + unsigned input_idx, + const svalue *sval, + bool simple) const; + unsigned input_idx_to_asm_idx (unsigned input_idx) const; + + const char *m_asm_string; + unsigned m_output_idx; + + /* We capture this so that we can offset the input indices + to match the %0, %1, %2 in the asm_string when dumping. 
*/ + unsigned m_num_outputs; + + unsigned m_num_inputs; + const svalue *m_input_arr[MAX_INPUTS]; +}; + +} // namespace ana + +template <> +template <> +inline bool +is_a_helper <const asm_output_svalue *>::test (const svalue *sval) +{ + return sval->get_kind () == SK_ASM_OUTPUT; +} + +template <> struct default_hash_traits<asm_output_svalue::key_t> +: public member_function_hash_traits<asm_output_svalue::key_t> +{ + static const bool empty_zero_p = true; +}; #endif /* GCC_ANALYZER_SVALUE_H */ diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c index b23b82b..4c1fc6b 100644 --- a/gcc/auto-profile.c +++ b/gcc/auto-profile.c @@ -1009,13 +1009,18 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, const icall_target_map &map, histogram_value hist = gimple_alloc_histogram_value ( cfun, HIST_TYPE_INDIR_CALL, stmt, callee); - hist->n_counters = 3; + hist->n_counters = 4; hist->hvalue.counters = XNEWVEC (gcov_type, hist->n_counters); gimple_add_histogram_value (cfun, stmt, hist); - hist->hvalue.counters[0] = direct_call->profile_id; - hist->hvalue.counters[1] = max_iter->second; - hist->hvalue.counters[2] = total; + // Total counter + hist->hvalue.counters[0] = total; + // Number of value/counter pairs + hist->hvalue.counters[1] = 1; + // Value + hist->hvalue.counters[2] = direct_call->profile_id; + // Counter + hist->hvalue.counters[3] = max_iter->second; if (!transform) return; diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c index 6284ae2..2ba9918 100644 --- a/gcc/cfgloop.c +++ b/gcc/cfgloop.c @@ -2104,3 +2104,69 @@ mark_loop_for_removal (loop_p loop) loop->latch = NULL; loops_state_set (LOOPS_NEED_FIXUP); } + +/* Starting from loop tree ROOT, walk the loop tree in the visiting + order specified by FLAGS. The supported visiting orders + are: + - LI_ONLY_INNERMOST + - LI_FROM_INNERMOST + - Preorder (if neither of the above is specified) */ + +void +loops_list::walk_loop_tree (class loop *root, unsigned flags) +{ + bool only_innermost_p = flags & LI_ONLY_INNERMOST; + bool from_innermost_p = flags & LI_FROM_INNERMOST; + bool preorder_p = !(only_innermost_p || from_innermost_p); + + /* Handle a root without any inner loops early, to make later + processing simpler: that way, none of the loops processed in the + following while loop can be the root. */ + if (!root->inner) + { + if (flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); + return; + } + else if (preorder_p && flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); + + class loop *aloop; + for (aloop = root->inner; + aloop->inner != NULL; + aloop = aloop->inner) + { + if (preorder_p) + this->to_visit.quick_push (aloop->num); + continue; + } + + while (1) + { + gcc_assert (aloop != root); + if (from_innermost_p || aloop->inner == NULL) + this->to_visit.quick_push (aloop->num); + + if (aloop->next) + { + for (aloop = aloop->next; + aloop->inner != NULL; + aloop = aloop->inner) + { + if (preorder_p) + this->to_visit.quick_push (aloop->num); + continue; + } + } + else if (loop_outer (aloop) == root) + break; + else + aloop = loop_outer (aloop); + } + + /* When visiting from innermost, we need to consider root here + since the previous while loop doesn't handle it. */ + if (from_innermost_p && flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); +} + diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 885632b..0f71a6bf 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -669,13 +669,15 @@ as_const (T &t) } /* A list for visiting loops, which contains the loop numbers instead of
The scope is restricted in function FN and the - visiting order is specified by FLAGS. */ + the loop pointers. If the loop ROOT is offered (non-null), the visiting + will start from it, otherwise it would start from the tree_root of + loops_for_fn (FN) instead. The scope is restricted in function FN and + the visiting order is specified by FLAGS. */ class loops_list { public: - loops_list (function *fn, unsigned flags); + loops_list (function *fn, unsigned flags, class loop *root = nullptr); template <typename T> class Iter { @@ -750,6 +752,10 @@ public: } private: + /* Walk loop tree starting from ROOT as the visiting order specified + by FLAGS. */ + void walk_loop_tree (class loop *root, unsigned flags); + /* The function we are visiting. */ function *fn; @@ -782,76 +788,50 @@ loops_list::Iter<T>::fill_curr_loop () } /* Set up the loops list to visit according to the specified - function scope FN and iterating order FLAGS. */ + function scope FN and iterating order FLAGS. If ROOT is + not null, the visiting would start from it, otherwise it + will start from tree_root of loops_for_fn (FN). */ -inline loops_list::loops_list (function *fn, unsigned flags) +inline loops_list::loops_list (function *fn, unsigned flags, class loop *root) { - class loop *aloop; - unsigned i; - int mn; + struct loops *loops = loops_for_fn (fn); + gcc_assert (!root || loops); + + /* Check mutually exclusive flags should not co-exist. */ + unsigned checked_flags = LI_ONLY_INNERMOST | LI_FROM_INNERMOST; + gcc_assert ((flags & checked_flags) != checked_flags); this->fn = fn; - if (!loops_for_fn (fn)) + if (!loops) return; + class loop *tree_root = root ? root : loops->tree_root; + this->to_visit.reserve_exact (number_of_loops (fn)); - mn = (flags & LI_INCLUDE_ROOT) ? 0 : 1; - if (flags & LI_ONLY_INNERMOST) + /* When root is tree_root of loops_for_fn (fn) and the visiting + order is LI_ONLY_INNERMOST, we would like to use linear + search here since it has a more stable bound than the + walk_loop_tree. */ + if (flags & LI_ONLY_INNERMOST && tree_root == loops->tree_root) { - for (i = 0; vec_safe_iterate (loops_for_fn (fn)->larray, i, &aloop); i++) - if (aloop != NULL - && aloop->inner == NULL - && aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - } - else if (flags & LI_FROM_INNERMOST) - { - /* Push the loops to LI->TO_VISIT in postorder. */ - for (aloop = loops_for_fn (fn)->tree_root; - aloop->inner != NULL; - aloop = aloop->inner) - continue; - - while (1) + gcc_assert (tree_root->num == 0); + if (tree_root->inner == NULL) { - if (aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - - if (aloop->next) - { - for (aloop = aloop->next; - aloop->inner != NULL; - aloop = aloop->inner) - continue; - } - else if (!loop_outer (aloop)) - break; - else - aloop = loop_outer (aloop); + if (flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (0); + + return; } + + class loop *aloop; + unsigned int i; + for (i = 1; vec_safe_iterate (loops->larray, i, &aloop); i++) + if (aloop != NULL && aloop->inner == NULL) + this->to_visit.quick_push (aloop->num); } else - { - /* Push the loops to LI->TO_VISIT in preorder. */ - aloop = loops_for_fn (fn)->tree_root; - while (1) - { - if (aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - - if (aloop->inner != NULL) - aloop = aloop->inner; - else - { - while (aloop != NULL && aloop->next == NULL) - aloop = loop_outer (aloop); - if (aloop == NULL) - break; - aloop = aloop->next; - } - } - } + walk_loop_tree (tree_root, flags); } /* The properties of the target. 
*/ diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index f80de2c..4cd4b03 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -78,6 +78,7 @@ #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" #include "fractional-cost.h" +#include "rtlanal.h" /* This file should be included last. */ #include "target-def.h" @@ -12046,6 +12047,42 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } +/* Helper function for rtx cost calculation. Strip extension as well as any + inner VEC_SELECT high-half from X. Returns the inner vector operand if + successful, or the original expression on failure. */ +static rtx +aarch64_strip_extend_vec_half (rtx x) +{ + if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT + && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)), + XEXP (x, 1))) + x = XEXP (x, 0); + } + return x; +} + +/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as + any subsequent extend and VEC_SELECT from X. Returns the inner scalar + operand if successful, or the original expression on failure. */ +static rtx +aarch64_strip_duplicate_vec_elt (rtx x) +{ + if (GET_CODE (x) == VEC_DUPLICATE + && is_a<scalar_mode> (GET_MODE (XEXP (x, 0)))) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT) + x = XEXP (x, 0); + else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + && GET_CODE (XEXP (x, 0)) == VEC_SELECT) + x = XEXP (XEXP (x, 0), 0); + } + return x; +} + /* Return true iff CODE is a shift supported in combination with arithmetic instructions. */ @@ -12113,16 +12150,17 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_ADVSIMD) { + /* The select-operand-high-half versions of the instruction have the + same cost as the three vector version - don't add the costs of the + extension or selection into the costs of the multiply. */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); /* The by-element versions of the instruction have the same costs as - the normal 3-vector version. So don't add the costs of the - duplicate into the costs of the multiply. We make an assumption - that the input to the VEC_DUPLICATE is already on the FP & SIMD - side. This means costing of a MUL by element pre RA is a bit - optimistic. */ - if (GET_CODE (op0) == VEC_DUPLICATE) - op0 = XEXP (op0, 0); - else if (GET_CODE (op1) == VEC_DUPLICATE) - op1 = XEXP (op1, 0); + the normal 3-vector version. We make an assumption that the input + to the VEC_DUPLICATE is already on the FP & SIMD side. This means + costing of a MUL by element pre RA is a bit optimistic. */ + op0 = aarch64_strip_duplicate_vec_elt (op0); + op1 = aarch64_strip_duplicate_vec_elt (op1); } cost += rtx_cost (op0, mode, MULT, 0, speed); cost += rtx_cost (op1, mode, MULT, 1, speed); @@ -13051,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, op1 = XEXP (x, 1); cost_minus: + if (VECTOR_MODE_P (mode)) + { + /* SUBL2 and SUBW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the sub instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the sub. 
+ */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + *cost += rtx_cost (op0, mode, MINUS, 0, speed); /* Detect valid immediates. */ @@ -13123,6 +13176,21 @@ cost_minus: op1 = XEXP (x, 1); cost_plus: + if (VECTOR_MODE_P (mode)) + { + /* ADDL2 and ADDW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the add instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the add. + */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) { @@ -14752,40 +14820,6 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } -/* Return true if an operaton of kind KIND for STMT_INFO represents - the extraction of an element from a vector in preparation for - storing the element to memory. */ -static bool -aarch64_is_store_elt_extraction (vect_cost_for_stmt kind, - stmt_vec_info stmt_info) -{ - return (kind == vec_to_scalar - && STMT_VINFO_DATA_REF (stmt_info) - && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); -} - -/* Return true if STMT_INFO represents part of a reduction. */ -static bool -aarch64_is_reduction (stmt_vec_info stmt_info) -{ - return (STMT_VINFO_REDUC_DEF (stmt_info) - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); -} - -/* If STMT_INFO describes a reduction, return the type of reduction - it describes, otherwise return -1. */ -static int -aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) -{ - if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo)) - if (STMT_VINFO_REDUC_DEF (stmt_info)) - { - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); - return int (STMT_VINFO_REDUC_TYPE (reduc_info)); - } - return -1; -} - /* Return true if an access of kind KIND for STMT_INFO represents one vector of an LD[234] or ST[234] operation. Return the total number of vectors (2, 3 or 4) if so, otherwise return a value outside that range. */ @@ -14806,32 +14840,6 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) return 0; } -/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_embedded_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt)) - if (gimple_assign_rhs_code (assign) == COND_EXPR) - { - tree cond = gimple_assign_rhs1 (assign); - if (COMPARISON_CLASS_P (cond)) - return TREE_TYPE (TREE_OPERAND (cond, 0)); - } - return NULL_TREE; -} - -/* If STMT_INFO is a comparison or contains an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt)) - if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) - return TREE_TYPE (gimple_assign_rhs1 (assign)); - return aarch64_embedded_comparison_type (stmt_info); -} - /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. 
*/ @@ -14858,43 +14866,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, return is_gimple_assign (stmt_info->stmt); } -/* Return true if STMT_INFO extends the result of a load. */ -static bool -aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree rhs = gimple_assign_rhs1 (stmt_info->stmt); - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (rhs); - if (!INTEGRAL_TYPE_P (lhs_type) - || !INTEGRAL_TYPE_P (rhs_type) - || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) - return false; - - stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); - return (def_stmt_info - && STMT_VINFO_DATA_REF (def_stmt_info) - && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); -} - -/* Return true if STMT_INFO is an integer truncation. */ -static bool -aarch64_integer_truncation_p (stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); - return (INTEGRAL_TYPE_P (lhs_type) - && INTEGRAL_TYPE_P (rhs_type) - && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); -} - /* Return true if STMT_INFO is the second part of a two-statement multiply-add or multiply-subtract sequence that might be suitable for fusing into a single instruction. If VEC_FLAGS is zero, analyze the operation as @@ -14997,7 +14968,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, tree vectype, const sve_vec_cost *sve_costs) { - switch (aarch64_reduc_type (vinfo, stmt_info)) + switch (vect_reduc_type (vinfo, stmt_info)) { case EXTRACT_LAST_REDUCTION: return sve_costs->clast_cost; @@ -15032,7 +15003,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, scalar operation. - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the - the Advanced SIMD implementation. + Advanced SIMD implementation. - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the SVE implementation. @@ -15088,7 +15059,7 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, { /* Detect an extension of a loaded value. In general, we'll be able to fuse the extension with the load. */ - if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info)) return 0; return stmt_cost; @@ -15120,7 +15091,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which vec_to_scalar is describing the extraction of a vector element in preparation for a scalar store. The store itself is costed separately. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) return simd_costs->store_elt_extra_cost; /* Detect SVE gather loads, which are costed as a single scalar_load @@ -15159,7 +15130,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, instruction like FADDP or MAXV. 
*/ if (kind == vec_to_scalar && where == vect_epilogue - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) switch (GET_MODE_INNER (TYPE_MODE (vectype))) { case E_QImode: @@ -15209,12 +15180,12 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, on the fly. Optimistically assume that a load followed by an extension will fold to this form during combine, and that the extension therefore comes for free. */ - if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info)) stmt_cost = 0; /* For similar reasons, vector_stmt integer truncations are a no-op, because we can just ignore the unused upper bits of the source. */ - if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info)) + if (kind == vector_stmt && vect_is_integer_truncation (stmt_info)) stmt_cost = 0; /* Advanced SIMD can load and store pairs of registers using LDP and STP, @@ -15289,7 +15260,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == vector_stmt || kind == vec_to_scalar) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += simd_costs->fp_stmt_cost; @@ -15299,7 +15270,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == scalar_stmt) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost; @@ -15349,12 +15320,12 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Calculate the minimum cycles per iteration imposed by a reduction operation. */ if ((kind == vector_stmt || kind == vec_to_scalar) - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) { unsigned int base = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, vec_flags); - if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + if (vect_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) { if (aarch64_sve_mode_p (TYPE_MODE (vectype))) { @@ -15453,7 +15424,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Add any embedded comparison operations. */ if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) - && aarch64_embedded_comparison_type (stmt_info)) + && vect_embedded_comparison_type (stmt_info)) ops->general_ops += num_copies; /* Detect COND_REDUCTIONs and things that would need to become @@ -15462,7 +15433,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, have only accounted for one. */ if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) { - int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + int reduc_type = vect_reduc_type (vinfo, stmt_info); if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) || reduc_type == COND_REDUCTION) ops->general_ops += num_copies; @@ -15470,7 +15441,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Count the predicate operations needed by an SVE comparison. */ if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) - if (tree type = aarch64_comparison_type (stmt_info)) + if (tree type = vect_comparison_type (stmt_info)) { unsigned int base = (FLOAT_TYPE_P (type) ? 
sve_issue->fp_cmp_pred_ops @@ -15548,7 +15519,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* If we scalarize a strided store, the vectorizer costs one vec_to_scalar for each element. However, we can store the first element using an FP store without a separate extract step. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) count -= 1; stmt_cost = aarch64_detect_scalar_stmt_subtype diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index ae2139c..cc7901b 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -409,6 +409,7 @@ arm_pragma_target_parse (tree args, tree pop_target) target_option_current_node = cur_tree; arm_configure_build_target (&arm_active_target, TREE_TARGET_OPTION (cur_tree), false); + arm_option_reconfigure_globals (); } /* Update macros if target_node changes. The global state will be restored diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index ab4b6ac..249995a 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1080,6 +1080,7 @@ begin cpu generic-armv7-a cname genericv7a tune flags LDSCHED architecture armv7-a+fp + isa quirk_no_asmcpu option mp add mp option sec add sec option vfpv3-d16 add VFPv3 FP_DBL diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 6d781e2..11dafc7 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -79,10 +79,6 @@ typedef struct minipool_node Mnode; typedef struct minipool_fixup Mfix; -/* The last .arch and .fpu assembly strings that we printed. */ -static std::string arm_last_printed_arch_string; -static std::string arm_last_printed_fpu_string; - void (*arm_lang_output_object_attributes_hook)(void); struct four_ints @@ -334,6 +330,7 @@ static rtx_insn *thumb1_md_asm_adjust (vec<rtx> &, vec<rtx> &, vec<machine_mode> &, vec<const char *> &, vec<rtx> &, HARD_REG_SET &, location_t); +static const char *arm_identify_fpu_from_isa (sbitmap); /* Table of machine attributes. */ static const struct attribute_spec arm_attribute_table[] = @@ -3058,6 +3055,7 @@ arm_option_restore (struct gcc_options */* opts */, struct cl_target_option *ptr) { arm_configure_build_target (&arm_active_target, ptr, false); + arm_option_reconfigure_globals (); } /* Reset options between modes that the user has specified. */ @@ -3410,6 +3408,11 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_ior (target->isa, target->isa, fpu_bits); } + /* If we have the soft-float ABI, clear any feature bits relating to use of + floating-point operations. They'll just confuse things later on. */ + if (arm_float_abi == ARM_FLOAT_ABI_SOFT) + bitmap_and_compl (target->isa, target->isa, isa_all_fpbits); + /* There may be implied bits which we still need to enable. These are non-named features which are needed to complete other sets of features, but cannot be enabled from arm-cpus.in due to being shared between @@ -3432,6 +3435,8 @@ arm_configure_build_target (struct arm_build_target *target, const cpu_tune *tune_data = &all_tunes[arm_selected_tune - all_cores]; /* Finish initializing the target structure. 
*/ + if (!target->arch_name) + target->arch_name = arm_selected_arch->common.name; target->arch_pp_name = arm_selected_arch->arch; target->base_arch = arm_selected_arch->base_arch; target->profile = arm_selected_arch->profile; @@ -3439,7 +3444,6 @@ arm_configure_build_target (struct arm_build_target *target, target->tune_flags = tune_data->tune_flags; target->tune = tune_data->tune; target->tune_core = tune_data->scheduler; - arm_option_reconfigure_globals (); } /* Fix up any incompatible options that the user has specified. */ @@ -28094,20 +28098,65 @@ arm_print_tune_info (void) (int) current_tune->sched_autopref); } +/* The last set of target options used to emit .arch directives, etc. This + could be a function-local static if it were not required to expose it as a + root to the garbage collector. */ +static GTY(()) cl_target_option *last_asm_targ_options = NULL; + /* Print .arch and .arch_extension directives corresponding to the current architecture configuration. */ static void -arm_print_asm_arch_directives () +arm_print_asm_arch_directives (FILE *stream, cl_target_option *targ_options) { + arm_build_target build_target; + /* If the target options haven't changed since the last time we were called + there is nothing to do. This should be sufficient to suppress the + majority of redundant work. */ + if (last_asm_targ_options == targ_options) + return; + + last_asm_targ_options = targ_options; + + build_target.isa = sbitmap_alloc (isa_num_bits); + arm_configure_build_target (&build_target, targ_options, false); + + if (build_target.core_name + && !bitmap_bit_p (build_target.isa, isa_bit_quirk_no_asmcpu)) + { + const char* truncated_name + = arm_rewrite_selected_cpu (build_target.core_name); + asm_fprintf (stream, "\t.cpu %s\n", truncated_name); + } + const arch_option *arch = arm_parse_arch_option_name (all_architectures, "-march", - arm_active_target.arch_name); + build_target.arch_name); auto_sbitmap opt_bits (isa_num_bits); gcc_assert (arch); - asm_fprintf (asm_out_file, "\t.arch %s\n", arm_active_target.arch_name); - arm_last_printed_arch_string = arm_active_target.arch_name; + if (strcmp (build_target.arch_name, "armv7ve") == 0) + { + /* Keep backward compatibility for assemblers which don't support + armv7ve. Fortunately, none of the following extensions are reset + by a .fpu directive. */ + asm_fprintf (stream, "\t.arch armv7-a\n"); + asm_fprintf (stream, "\t.arch_extension virt\n"); + asm_fprintf (stream, "\t.arch_extension idiv\n"); + asm_fprintf (stream, "\t.arch_extension sec\n"); + asm_fprintf (stream, "\t.arch_extension mp\n"); + } + else + asm_fprintf (stream, "\t.arch %s\n", build_target.arch_name); + + /* The .fpu directive will reset any architecture extensions from the + assembler that relate to the fp/vector extensions. So put this out before + any .arch_extension directives. */ + const char *fpu_name = (TARGET_SOFT_FLOAT + ? "softvfp" + : arm_identify_fpu_from_isa (build_target.isa)); + asm_fprintf (stream, "\t.fpu %s\n", fpu_name); + if (!arch->common.extensions) return; @@ -28133,13 +28182,12 @@ arm_print_asm_arch_directives () && !TARGET_HAVE_MVE_FLOAT)) continue; - /* If every feature bit of this option is set in the target - ISA specification, print out the option name. However, - don't print anything if all the bits are part of the - FPU specification. */ - if (bitmap_subset_p (opt_bits, arm_active_target.isa) + /* If every feature bit of this option is set in the target ISA + specification, print out the option name.
However, don't print + anything if all the bits are part of the FPU specification. */ + if (bitmap_subset_p (opt_bits, build_target.isa) && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", opt->name); + asm_fprintf (stream, "\t.arch_extension %s\n", opt->name); } } } @@ -28149,46 +28197,23 @@ arm_file_start (void) { int val; + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (TARGET_BPABI) { - /* We don't have a specified CPU. Use the architecture to - generate the tags. - - Note: it might be better to do this unconditionally, then the - assembler would not need to know about all new CPU names as - they are added. */ - if (!arm_active_target.core_name) - { - /* armv7ve doesn't support any extensions. */ - if (strcmp (arm_active_target.arch_name, "armv7ve") == 0) - { - /* Keep backward compatability for assemblers - which don't support armv7ve. */ - asm_fprintf (asm_out_file, "\t.arch armv7-a\n"); - asm_fprintf (asm_out_file, "\t.arch_extension virt\n"); - asm_fprintf (asm_out_file, "\t.arch_extension idiv\n"); - asm_fprintf (asm_out_file, "\t.arch_extension sec\n"); - asm_fprintf (asm_out_file, "\t.arch_extension mp\n"); - arm_last_printed_arch_string = "armv7ve"; - } - else - arm_print_asm_arch_directives (); - } - else if (startswith (arm_active_target.core_name, "generic")) - { - asm_fprintf (asm_out_file, "\t.arch %s\n", - arm_active_target.core_name + 8); - arm_last_printed_arch_string = arm_active_target.core_name + 8; - } - else + /* If we have a named cpu, but the assembler does not support that + name via .cpu, put out a cpu name attribute; but don't do this if the + name starts with the fictitious prefix, 'generic'. */ + if (arm_active_target.core_name + && bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu) + && !startswith (arm_active_target.core_name, "generic")) { const char* truncated_name = arm_rewrite_selected_cpu (arm_active_target.core_name); if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu)) asm_fprintf (asm_out_file, "\t.eabi_attribute 5, \"%s\"\n", truncated_name); - else - asm_fprintf (asm_out_file, "\t.cpu %s\n", truncated_name); } if (print_tune_info) @@ -28253,6 +28278,13 @@ arm_file_end (void) { int regno; + /* Just in case the last function output in the assembler had non-default + architecture directives, we force the assembler state back to the default + set, so that any 'calculated' build attributes are based on the default + options rather than the special options for that function. */ + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (NEED_INDICATE_EXEC_STACK) /* Add .note.GNU-stack. */ file_end_indicate_exec_stack (); @@ -33263,58 +33295,7 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) targ_options = TREE_TARGET_OPTION (target_option_current_node); gcc_assert (targ_options); - /* Only update the assembler .arch string if it is distinct from the last - such string we printed. arch_to_print is set conditionally in case - targ_options->x_arm_arch_string is NULL which can be the case - when cc1 is invoked directly without passing -march option.
*/ - std::string arch_to_print; - if (targ_options->x_arm_arch_string) - arch_to_print = targ_options->x_arm_arch_string; - - if (arch_to_print != arm_last_printed_arch_string) - { - std::string arch_name - = arch_to_print.substr (0, arch_to_print.find ("+")); - asm_fprintf (asm_out_file, "\t.arch %s\n", arch_name.c_str ()); - const arch_option *arch - = arm_parse_arch_option_name (all_architectures, "-march", - targ_options->x_arm_arch_string); - auto_sbitmap opt_bits (isa_num_bits); - - gcc_assert (arch); - if (arch->common.extensions) - { - for (const struct cpu_arch_extension *opt = arch->common.extensions; - opt->name != NULL; - opt++) - { - if (!opt->remove) - { - arm_initialize_isa (opt_bits, opt->isa_bits); - /* For the cases "-march=armv8.1-m.main+mve -mfloat-abi=soft" - and "-march=armv8.1-m.main+mve.fp -mfloat-abi=soft" MVE and - MVE with floating point instructions is disabled. So the - following check restricts the printing of ".arch_extension - mve" and ".arch_extension fp" (for mve.fp) in the assembly - file. MVE needs this special behaviour because the - feature bit "mve" and "mve_float" are not part of - "fpu bits", so they are not cleared when -mfloat-abi=soft - (i.e nofp) but the marco TARGET_HAVE_MVE and - TARGET_HAVE_MVE_FLOAT are disabled. */ - if ((bitmap_bit_p (opt_bits, isa_bit_mve) && !TARGET_HAVE_MVE) - || (bitmap_bit_p (opt_bits, isa_bit_mve_float) - && !TARGET_HAVE_MVE_FLOAT)) - continue; - if (bitmap_subset_p (opt_bits, arm_active_target.isa) - && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", - opt->name); - } - } - } - - arm_last_printed_arch_string = arch_to_print; - } + arm_print_asm_arch_directives (stream, targ_options); fprintf (stream, "\t.syntax unified\n"); @@ -33332,17 +33313,6 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) else fprintf (stream, "\t.arm\n"); - std::string fpu_to_print - = TARGET_SOFT_FLOAT - ? "softvfp" : arm_identify_fpu_from_isa (arm_active_target.isa); - - if (!(!strcmp (fpu_to_print.c_str (), "softvfp") && TARGET_VFP_BASE) - && (fpu_to_print != arm_last_printed_arch_string)) - { - asm_fprintf (asm_out_file, "\t.fpu %s\n", fpu_to_print.c_str ()); - arm_last_printed_fpu_string = fpu_to_print; - } - if (TARGET_POKE_FUNCTION_NAME) arm_poke_function_name (stream, (const char *) name); } diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 1d469bf..bd21efa 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -613,7 +613,11 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) arguments in memory. */ if (!register_operand (op0, mode) && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); + { + rtx scratch = ix86_gen_scratch_sse_rtx (mode); + emit_move_insn (scratch, op1); + op1 = scratch; + } tmp[0] = op0; tmp[1] = op1; ix86_expand_vector_move_misalign (mode, tmp); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 842eb0e..aea224a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -554,7 +554,7 @@ ix86_can_inline_p (tree caller, tree callee) /* Changes of those flags can be tolerated for always inlines. Lets hope user knows what he is doing. 
*/ - const unsigned HOST_WIDE_INT always_inline_safe_mask + unsigned HOST_WIDE_INT always_inline_safe_mask = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD @@ -579,6 +579,10 @@ ix86_can_inline_p (tree caller, tree callee) && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + /* If callee only uses GPRs, ignore MASK_80387. */ + if (TARGET_GENERAL_REGS_ONLY_P (callee_opts->x_ix86_target_flags)) + always_inline_safe_mask |= MASK_80387; + cgraph_node *callee_node = cgraph_node::get (callee); /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 function can inline a SSE2 function but a SSE2 function can't inline @@ -23335,9 +23339,21 @@ rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { if (TARGET_SSE && !lra_in_progress) - return gen_rtx_REG (mode, (TARGET_64BIT - ? LAST_REX_SSE_REG - : LAST_SSE_REG)); + { + unsigned int regno; + if (TARGET_64BIT) + { + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always + use XMM31 for CSE. */ + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode)) + regno = LAST_EXT_REX_SSE_REG; + else + regno = LAST_REX_SSE_REG; + } + else + regno = LAST_SSE_REG; + return gen_rtx_REG (mode, regno); + } else return gen_reg_rtx (mode); } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9..21fe51b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1780,18 +1780,22 @@ typedef struct ix86_args { && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? 16 : UNITS_PER_WORD))) -/* STORE_MAX_PIECES is the number of bytes at a time that we can - store efficiently. */ +/* STORE_MAX_PIECES is the number of bytes at a time that we can store + efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled + since vec_duplicate enabled by inter-unit move is used to implement + store_by_pieces of 16/32/64 bytes. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0c23ddb..51e8b47 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19423,11 +19423,11 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1). 
;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + [(set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) - (set (match_operand:SWI248 2 "register_operand") + (set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) (set (match_dup 0) (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" @@ -19455,10 +19455,10 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2). ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 2 "register_operand") + [(set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) - (set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + (set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) (set (match_dup 0) diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index 5422b0f..df99220 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -107,12 +107,22 @@ __rdpmc (int __S) #endif /* __iamcu__ */ /* rdtsc */ -#define __rdtsc() __builtin_ia32_rdtsc () +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtsc (void) +{ + return __builtin_ia32_rdtsc (); +} #ifndef __iamcu__ /* rdtscp */ -#define __rdtscp(a) __builtin_ia32_rdtscp (a) +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtscp (unsigned int *__A) +{ + return __builtin_ia32_rdtscp (__A); +} #endif /* __iamcu__ */ diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index e280250..89b5b94 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -34,7 +34,12 @@ #define __DISABLE_SERIALIZE__ #endif /* __SERIALIZE__ */ -#define _serialize() __builtin_ia32_serialize () +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_serialize (void) +{ + __builtin_ia32_serialize (); +} #ifdef __DISABLE_SERIALIZE__ #undef __DISABLE_SERIALIZE__ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 52b2b42..a46a237 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2376,6 +2376,24 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_expand "cond_<code><mode>" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (smaxmin:VF + (match_operand:VF 2 "vector_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "<MODE_SIZE> == 64 || TARGET_AVX512VL" +{ + emit_insn (gen_<code><mode>3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "<code><mode>3<mask_name><round_saeonly_name>" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF @@ -4438,6 +4456,29 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +(define_expand "cond_fma<mode>" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + 
(match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_fma<mode>4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (<MODE>mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "<avx512>_fmadd_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4515,6 +4556,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +(define_expand "cond_fms<mode>" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 "vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_fms<mode>4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (<MODE>mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "<avx512>_fmsub_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4594,6 +4659,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +(define_expand "cond_fnma<mode>" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_fnma<mode>4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (<MODE>mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "<avx512>_fnmadd_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4675,6 +4764,31 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +(define_expand "cond_fnms<mode>" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_fnms<mode>4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (<MODE>mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "<avx512>_fnmsub_<mode>_mask<round_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -12974,6 +13088,24 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "cond_<code><mode>" + [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") + 
(vec_merge:VI1248_AVX512VLBW + (maxmin:VI1248_AVX512VLBW + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand") + (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand")) + (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_<code><mode>3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "<code><mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL @@ -13931,6 +14063,24 @@ DONE; }) +(define_expand "cond_<code><mode>" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (vec_merge:VI48_AVX512VL + (any_logic:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_operand") + (match_operand:VI48_AVX512VL 3 "vector_operand")) + (match_operand:VI48_AVX512VL 4 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_<code><mode>3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "<mask_codefor><code><mode>3<mask_name>" [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,x,v") (any_logic:VI48_AVX_AVX512F diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h index 7793032..b7fefa7 100644 --- a/gcc/config/i386/x86gprintrin.h +++ b/gcc/config/i386/x86gprintrin.h @@ -24,6 +24,12 @@ #ifndef _X86GPRINTRIN_H_INCLUDED #define _X86GPRINTRIN_H_INCLUDED +#if defined __MMX__ || defined __SSE__ +#pragma GCC push_options +#pragma GCC target("general-regs-only") +#define __DISABLE_GENERAL_REGS_ONLY__ +#endif + #include <ia32intrin.h> #ifndef __iamcu__ @@ -255,4 +261,9 @@ _ptwrite32 (unsigned __B) #endif /* __iamcu__ */ +#ifdef __DISABLE_GENERAL_REGS_ONLY__ +#undef __DISABLE_GENERAL_REGS_ONLY__ +#pragma GCC pop_options +#endif /* __DISABLE_GENERAL_REGS_ONLY__ */ + #endif /* _X86GPRINTRIN_H_INCLUDED. */ diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md index 561ce97..c8cff1a 100644 --- a/gcc/config/rs6000/constraints.md +++ b/gcc/config/rs6000/constraints.md @@ -17,7 +17,7 @@ ;; along with GCC; see the file COPYING3. If not see ;; <http://www.gnu.org/licenses/>. -;; Available constraint letters: e k q t u A B C D S T +;; Available constraint letters: k q t u A B C D S T ;; Register constraints diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 0145b92..3767a67 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -296,4 +296,31 @@ _mm_floor_ss (__m128 __A, __m128 __B) return __r; } +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. 
*/ +__inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + #endif diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 6f6fc0b..441735d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -1014,7 +1014,7 @@ [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q") (match_operand:VSX_LE_128 1 "vsx_register_operand" "+wa,r"))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR - & !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)" + && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)" "@ # #" diff --git a/gcc/config/s390/s390-modes.def b/gcc/config/s390/s390-modes.def index 6d814fc..245c2b8 100644 --- a/gcc/config/s390/s390-modes.def +++ b/gcc/config/s390/s390-modes.def @@ -259,14 +259,17 @@ CC_MODE (CCVFANY); /* Vector modes. */ -VECTOR_MODES (INT, 2); /* V2QI */ -VECTOR_MODES (INT, 4); /* V4QI V2HI */ -VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ -VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 2); /* V2QI */ +VECTOR_MODES (INT, 4); /* V4QI V2HI */ +VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ +VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI V2TI */ VECTOR_MODE (FLOAT, SF, 2); /* V2SF */ VECTOR_MODE (FLOAT, SF, 4); /* V4SF */ +VECTOR_MODE (FLOAT, SF, 8); /* V8SF */ VECTOR_MODE (FLOAT, DF, 2); /* V2DF */ +VECTOR_MODE (FLOAT, DF, 4); /* V4DF */ VECTOR_MODE (INT, QI, 1); /* V1QI */ VECTOR_MODE (INT, HI, 1); /* V1HI */ diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 289e018..4b03c6e 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -122,6 +122,8 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); +extern rtx s390_expand_merge_perm_const (machine_mode, bool); +extern void s390_expand_merge (rtx, rtx, rtx, bool); extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 8c7d366..673a134 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -7014,6 +7014,42 @@ s390_expand_vec_init (rtx target, rtx vals) } } +/* Return a parallel of constant integers to be used as permutation + vector for a vector merge operation in MODE. If HIGH_P is true the + left-most elements of the source vectors are merged otherwise the + right-most elements. */ +rtx +s390_expand_merge_perm_const (machine_mode mode, bool high_p) +{ + int nelts = GET_MODE_NUNITS (mode); + rtx perm[16]; + int addend = high_p ? 0 : nelts; + + for (int i = 0; i < nelts; i++) + perm[i] = GEN_INT ((i + addend) / 2 + (i % 2) * nelts); + + return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelts, perm)); +} + +/* Emit RTL to implement a vector merge operation of SRC1 and SRC2 + which creates the result in TARGET. 
HIGH_P determines whether a + merge hi or lo will be generated. */ +void +s390_expand_merge (rtx target, rtx src1, rtx src2, bool high_p) +{ + machine_mode mode = GET_MODE (target); + opt_machine_mode opt_mode_2x = mode_for_vector (GET_MODE_INNER (mode), + 2 * GET_MODE_NUNITS (mode)); + gcc_assert (opt_mode_2x.exists ()); + machine_mode mode_double_nelts = opt_mode_2x.require (); + rtx constv = s390_expand_merge_perm_const (mode, high_p); + src1 = force_reg (GET_MODE (src1), src1); + src2 = force_reg (GET_MODE (src2), src2); + rtx x = gen_rtx_VEC_CONCAT (mode_double_nelts, src1, src2); + x = gen_rtx_VEC_SELECT (mode, x, constv); + emit_insn (gen_rtx_SET (target, x)); +} + /* Emit a vector constant that contains 1s in each element's sign bit position and 0s in other positions. MODE is the desired constant's mode. */ extern rtx @@ -16890,6 +16926,154 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, return after_md_seq; } +#define MAX_VECT_LEN 16 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool testing_p; +}; + +/* Try to expand the vector permute operation described by D using the + vector merge instructions vmrl and vmrh. Return true if vector merge + could be used. */ +static bool +expand_perm_with_merge (const struct expand_vec_perm_d &d) +{ + bool merge_lo_p = true; + bool merge_hi_p = true; + + if (d.nelt % 2) + return false; + + // For V4SI this checks for: { 0, 4, 1, 5 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt) + { + merge_hi_p = false; + break; + } + + if (!merge_hi_p) + { + // For V4SI this checks for: { 2, 6, 3, 7 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt) + { + merge_lo_p = false; + break; + } + } + else + merge_lo_p = false; + + if (d.testing_p) + return merge_lo_p || merge_hi_p; + + if (merge_lo_p || merge_hi_p) + s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p); + + return merge_lo_p || merge_hi_p; +} + +/* Try to expand the vector permute operation described by D using the + vector permute doubleword immediate instruction vpdi. Return true + if vpdi could be used. + + VPDI allows 4 different immediate values (0, 1, 4, 5). The 0 and 5 + cases are covered by vmrhg and vmrlg already. So we only care + about the 1, 4 cases here. + 1 - First element of src1 and second of src2 + 4 - Second element of src1 and first of src2 */ +static bool +expand_perm_with_vpdi (const struct expand_vec_perm_d &d) +{ + bool vpdi1_p = false; + bool vpdi4_p = false; + rtx op0_reg, op1_reg; + + // Only V2DI and V2DF are supported here. + if (d.nelt != 2) + return false; + + if (d.perm[0] == 0 && d.perm[1] == 3) + vpdi1_p = true; + + if (d.perm[0] == 1 && d.perm[1] == 2) + vpdi4_p = true; + + if (!vpdi1_p && !vpdi4_p) + return false; + + if (d.testing_p) + return true; + + op0_reg = force_reg (GET_MODE (d.op0), d.op0); + op1_reg = force_reg (GET_MODE (d.op1), d.op1); + + if (vpdi1_p) + emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg)); + + if (vpdi4_p) + emit_insn (gen_vpdi4 (d.vmode, d.target, op0_reg, op1_reg)); + + return true; +} + +/* Try to find the best sequence for the vector permute operation + described by D. Return true if the operation could be + expanded.
*/ +static bool +vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) +{ + if (expand_perm_with_merge (d)) + return true; + + if (expand_perm_with_vpdi (d)) + return true; + + return false; +} + +/* Return true if we can emit instructions for the constant + permutation vector in SEL. If OUTPUT, IN0, IN1 are non-null the + hook is supposed to emit the required INSNs. */ + +bool +s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned int i, nelt; + + if (!s390_vector_mode_supported_p (vmode) || GET_MODE_SIZE (vmode) != 16) + return false; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = target == NULL_RTX; + + gcc_assert (target == NULL_RTX || REG_P (target)); + gcc_assert (sel.length () == nelt); + + for (i = 0; i < nelt; i++) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + } + + return vectorize_vec_perm_const_1 (d); +} + /* Initialize GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP @@ -17200,6 +17384,10 @@ s390_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, #undef TARGET_MD_ASM_ADJUST #define TARGET_MD_ASM_ADJUST s390_md_asm_adjust +#undef TARGET_VECTORIZE_VEC_PERM_CONST +#define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-s390.h" diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 8ad21b0..1b894a9 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -158,8 +158,6 @@ UNSPEC_VEC_LOAD_BNDRY UNSPEC_VEC_LOAD_LEN UNSPEC_VEC_LOAD_LEN_R - UNSPEC_VEC_MERGEH - UNSPEC_VEC_MERGEL UNSPEC_VEC_PACK UNSPEC_VEC_PACK_SATURATE UNSPEC_VEC_PACK_SATURATE_CC @@ -168,7 +166,6 @@ UNSPEC_VEC_PACK_UNSIGNED_SATURATE_CC UNSPEC_VEC_PACK_UNSIGNED_SATURATE_GENCC UNSPEC_VEC_PERM - UNSPEC_VEC_PERMI UNSPEC_VEC_EXTEND UNSPEC_VEC_STORE_LEN UNSPEC_VEC_STORE_LEN_R diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index ab605b3..70274a6 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -36,7 +36,6 @@ (define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE") (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) -(define_mode_iterator V_HW_64 [V2DI V2DF]) (define_mode_iterator VT_HW_HSDT [V8HI V4SI V4SF V2DI V2DF V1TI V1TF TI TF]) (define_mode_iterator V_HW_HSD [V8HI V4SI (V4SF "TARGET_VXE") V2DI V2DF]) @@ -50,7 +49,10 @@ (define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) (define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) -(define_mode_iterator VI_HW_4 [V4SI V4SF]) + +; Directly supported vector modes with a certain number of elements +(define_mode_iterator V_HW_2 [V2DI V2DF]) +(define_mode_iterator V_HW_4 [V4SI V4SF]) ; All integer vector modes supported in a vector register + TImode (define_mode_iterator VIT [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI V1TI TI]) @@ -163,14 +165,14 @@ (DF "d") (V1DF "d") (V2DF "d") (TF "x") (V1TF "x")]) -; Vector with doubled element size. +; Vector with widened element size but half the number of elements. 
(define_mode_attr vec_double [(V1QI "V1HI") (V2QI "V1HI") (V4QI "V2HI") (V8QI "V4HI") (V16QI "V8HI") (V1HI "V1SI") (V2HI "V1SI") (V4HI "V2SI") (V8HI "V4SI") (V1SI "V1DI") (V2SI "V1DI") (V4SI "V2DI") (V1DI "V1TI") (V2DI "V1TI") (V1SF "V1DF") (V2SF "V1DF") (V4SF "V2DF")]) -; Vector with half the element size. +; Vector with narrowed element size but twice the number of elements. (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16QI") (V1SI "V2HI") (V2SI "V4HI") (V4SI "V8HI") (V1DI "V2SI") (V2DI "V4SI") @@ -178,6 +180,22 @@ (V1DF "V2SF") (V2DF "V4SF") (V1TF "V1DF")]) +; Vector with twice the number of elements but same element size. +(define_mode_attr vec_2x_nelts [(V1QI "V2QI") (V2QI "V4QI") (V4QI "V8QI") (V8QI "V16QI") (V16QI "V32QI") + (V1HI "V2HI") (V2HI "V4HI") (V4HI "V8HI") (V8HI "V16HI") + (V1SI "V2SI") (V2SI "V4SI") (V4SI "V8SI") + (V1DI "V2DI") (V2DI "V4DI") + (V1SF "V2SF") (V2SF "V4SF") (V4SF "V8SF") + (V1DF "V2DF") (V2DF "V4DF")]) + +; Vector with widened element size and the same number of elements. +(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI") + (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI") + (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI") + (V1DI "V1TI") (V2DI "V2TI") + (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF") + (V1DF "V1TF") (V2DF "V2TF")]) + ; Vector with half the element size AND half the number of elements. (define_mode_attr vec_halfhalf [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") @@ -532,8 +550,8 @@ }) (define_insn "*vec_vllezlf<mode>" - [(set (match_operand:VI_HW_4 0 "register_operand" "=v") - (vec_concat:VI_HW_4 + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_concat:V_HW_4 (vec_concat:<vec_halfnumelts> (match_operand:<non_vec> 1 "memory_operand" "R") (const_int 0)) @@ -748,6 +766,135 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) + +; First DW of op1 and second DW of op2 +(define_insn "@vpdi1<mode>" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat:<vec_2x_nelts> + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 3)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,1" + [(set_attr "op_type" "VRR")]) + +; Second DW of op1 and first of op2 +(define_insn "@vpdi4<mode>" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat:<vec_2x_nelts> + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 2)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,4" + [(set_attr "op_type" "VRR")]) + + +(define_insn "*vmrhb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_VX" + "vmrhb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int
26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_VX" + "vmrlb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_VX" + "vmrhh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_VX" + "vmrlh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_VX" + "vmrhf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_VX" + "vmrlf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 2)])))] + "TARGET_VX" + "vmrhg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 3)])))] + "TARGET_VX" + "vmrlg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + + (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) (subreg:DF (match_operand:TF 1 "general_operand" "v") 0))] @@ -779,7 +926,6 @@ operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8); }) -; vec_perm_const for V2DI using vpdi? 
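The vmrh/vmrl and vpdi patterns above encode the merges as explicit (vec_select (vec_concat ...)) RTL, so the element selection is visible to the optimizers rather than hidden behind an unspec. As a quick cross-check of the index rule they share with s390_expand_merge_perm_const, here is a minimal standalone C sketch (illustrative only, not part of the patch; merge_indices is a made-up name) that reproduces the merge-high selection { 0, 4, 1, 5 } and the merge-low selection { 2, 6, 3, 7 } for four-element modes such as V4SI:

#include <stdio.h>

/* Mirror the formula from s390_expand_merge_perm_const: element I of the
   result selects element (I + ADDEND) / 2 + (I % 2) * NELTS of the
   concatenation of the two inputs, where ADDEND is 0 for merge high and
   NELTS for merge low.  */
static void
merge_indices (int nelts, int high_p, int *perm)
{
  int addend = high_p ? 0 : nelts;
  for (int i = 0; i < nelts; i++)
    perm[i] = (i + addend) / 2 + (i % 2) * nelts;
}

int
main (void)
{
  int perm[4];
  merge_indices (4, 1, perm);
  printf ("high: %d %d %d %d\n", perm[0], perm[1], perm[2], perm[3]);
  merge_indices (4, 0, perm);
  printf ("low:  %d %d %d %d\n", perm[0], perm[1], perm[2], perm[3]);
  return 0;
}

These are exactly the permutations that expand_perm_with_merge tests for before falling back to the vpdi cases.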
;; ;; Vector integer arithmetic instructions @@ -1271,12 +1417,14 @@ (unspec:<vec_double> [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand:<vec_double> 0 "register_operand" "") - (unspec:<vec_double> [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select:<vec_double> + (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (<vec_double>mode); operands[4] = gen_reg_rtx (<vec_double>mode); + operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false); }) (define_expand "vec_widen_umult_hi_<mode>" @@ -1288,12 +1436,14 @@ (unspec:<vec_double> [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand:<vec_double> 0 "register_operand" "") - (unspec:<vec_double> [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select:<vec_double> + (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (<vec_double>mode); operands[4] = gen_reg_rtx (<vec_double>mode); + operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true); }) (define_expand "vec_widen_smult_lo_<mode>" @@ -1305,12 +1455,14 @@ (unspec:<vec_double> [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand:<vec_double> 0 "register_operand" "") - (unspec:<vec_double> [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select:<vec_double> + (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (<vec_double>mode); operands[4] = gen_reg_rtx (<vec_double>mode); + operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false); }) (define_expand "vec_widen_smult_hi_<mode>" @@ -1322,12 +1474,14 @@ (unspec:<vec_double> [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand:<vec_double> 0 "register_operand" "") - (unspec:<vec_double> [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select:<vec_double> + (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (<vec_double>mode); operands[4] = gen_reg_rtx (<vec_double>mode); + operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true); }) ; vec_widen_ushiftl_hi @@ -1816,9 +1970,9 @@ }) (define_insn "*vec_load_pair<mode>" - [(set (match_operand:V_HW_64 0 "register_operand" "=v,v") - (vec_concat:V_HW_64 (match_operand:<non_vec> 1 "register_operand" "d,v") - (match_operand:<non_vec> 2 "register_operand" "d,v")))] + [(set (match_operand:V_HW_2 0 "register_operand" "=v,v") + (vec_concat:V_HW_2 (match_operand:<non_vec> 1 "register_operand" "d,v") + (match_operand:<non_vec> 2 "register_operand" "d,v")))] "TARGET_VX" "@ vlvgp\t%v0,%1,%2 @@ -2166,29 +2320,35 @@ (define_expand "vec_unpacks_lo_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEL)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, false); +}) (define_expand "vec_unpacks_hi_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - 
UNSPEC_VEC_MERGEH)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, true); +}) ; double -> long double @@ -2204,29 +2364,35 @@ (define_expand "vec_unpacks_lo_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEL)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, false); +}) (define_expand "vec_unpacks_hi_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEH)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, true); +}) ; 2 x v2df -> 1 x v4sf diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 3df501b..3e7b854 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -22,7 +22,7 @@ (define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VI_HW_SD [V4SI V2DI]) -(define_mode_iterator V_HW_4 [V4SI V4SF]) + ; Full size vector modes with more than one element which are directly supported in vector registers by the hardware. (define_mode_iterator VEC_HW [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VECF_HW [(V4SF "TARGET_VXE") V2DF]) @@ -232,28 +232,27 @@ [(set_attr "op_type" "VRS,VRX,VSI")]) -; FIXME: The following two patterns might using vec_merge. 
But what is -; the canonical form: (vec_select (vec_merge op0 op1)) or (vec_merge -; (vec_select op0) (vec_select op1) ; vmrhb, vmrhh, vmrhf, vmrhg -(define_insn "vec_mergeh<mode>" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEH))] +(define_expand "vec_mergeh<mode>" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrh<bhfgq>\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], true); + DONE; +}) ; vmrlb, vmrlh, vmrlf, vmrlg -(define_insn "vec_mergel<mode>" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEL))] +(define_expand "vec_mergel<mode>" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrl<bhfgq>\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], false); + DONE; +}) ; Vector pack @@ -404,28 +403,22 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) +; Incoming op3 is in vec_permi format and will be turned into a +; permute vector consisting of op3 and op4. (define_expand "vec_permi<mode>" - [(set (match_operand:V_HW_64 0 "register_operand" "") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "") - (match_operand:V_HW_64 2 "register_operand" "") - (match_operand:QI 3 "const_mask_operand" "")] - UNSPEC_VEC_PERMI))] + [(set (match_operand:V_HW_2 0 "register_operand" "") + (vec_select:V_HW_2 + (vec_concat:<vec_2x_nelts> + (match_operand:V_HW_2 1 "register_operand" "") + (match_operand:V_HW_2 2 "register_operand" "")) + (parallel [(match_operand:QI 3 "const_mask_operand" "") (match_dup 4)])))] "TARGET_VX" { HOST_WIDE_INT val = INTVAL (operands[3]); - operands[3] = GEN_INT ((val & 1) | (val & 2) << 1); + operands[3] = GEN_INT ((val & 2) >> 1); + operands[4] = GEN_INT ((val & 1) + 2); }) -(define_insn "*vec_permi<mode>" - [(set (match_operand:V_HW_64 0 "register_operand" "=v") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:V_HW_64 2 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] - UNSPEC_VEC_PERMI))] - "TARGET_VX && (UINTVAL (operands[3]) & 10) == 0" - "vpdi\t%v0,%v1,%v2,%b3" - [(set_attr "op_type" "VRR")]) - ; Vector replicate @@ -459,17 +452,17 @@ ; A 31 bit target address is generated from 64 bit elements ; vsceg -(define_insn "vec_scatter_element<V_HW_64:mode>_SI" +(define_insn "vec_scatter_element<V_HW_2:mode>_SI" [(set (mem:<non_vec> (plus:SI (subreg:SI - (unspec:<non_vec_int> [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] + (unspec:<non_vec_int> [(match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:QI 3 "const_mask_operand" "C")] UNSPEC_VEC_EXTRACT) 4) - (match_operand:SI 2 "address_operand" "ZQ"))) - (unspec:<non_vec> [(match_operand:V_HW_64 0 "register_operand" "v") + (match_operand:SI 2 "address_operand" "ZQ"))) + (unspec:<non_vec> [(match_operand:V_HW_2 0 "register_operand" "v") (match_dup 3)]
UNSPEC_VEC_EXTRACT))] - "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (<V_HW_64:MODE>mode)" - "vsce<V_HW_64:bhfgq>\t%v0,%O2(%v1,%R2),%3" + "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (<V_HW_2:MODE>mode)" + "vsce<V_HW_2:bhfgq>\t%v0,%O2(%v1,%R2),%3" [(set_attr "op_type" "VRV")]) ; Element size and target address size is the same diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 5468792..5b3e191 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,9 @@ +2021-08-04 Jakub Jelinek <jakub@redhat.com> + + PR c++/101759 + * parser.c (cp_parser_default_argument): Temporarily override + parser->omp_declare_simd and parser->oacc_routine to NULL. + 2021-08-02 Patrick Palka <ppalka@redhat.com> PR c++/100828 diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 47bf7d9..02ce954 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -24488,6 +24488,8 @@ cp_parser_default_argument (cp_parser *parser, bool template_parm_p) set correctly. */ saved_greater_than_is_operator_p = parser->greater_than_is_operator_p; parser->greater_than_is_operator_p = !template_parm_p; + auto odsd = make_temp_override (parser->omp_declare_simd, NULL); + auto ord = make_temp_override (parser->oacc_routine, NULL); /* Local variable names (and the `this' keyword) may not appear in a default argument. */ saved_local_variables_forbidden_p = parser->local_variables_forbidden_p; diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9818ed0..ff99887 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13384,6 +13384,9 @@ Setting to 0 disables the analysis completely. @item modref-max-escape-points Specifies the maximum number of escape points tracked by modref per SSA-name. +@item threader-mode +Specifies the mode the backwards threader should run in. + @item profile-func-internal-id A parameter to control whether to use function internal id in profile database lookup. If the value is 0, the compiler uses an id that diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index f6d1bc1..f8047ae 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -6921,6 +6921,9 @@ operand 0, otherwise (operand 2 + operand 3) is moved. @cindex @code{cond_smax@var{mode}} instruction pattern @cindex @code{cond_umin@var{mode}} instruction pattern @cindex @code{cond_umax@var{mode}} instruction pattern +@cindex @code{cond_ashl@var{mode}} instruction pattern +@cindex @code{cond_ashr@var{mode}} instruction pattern +@cindex @code{cond_lshr@var{mode}} instruction pattern @item @samp{cond_add@var{mode}} @itemx @samp{cond_sub@var{mode}} @itemx @samp{cond_mul@var{mode}} @@ -6935,6 +6938,9 @@ operand 0, otherwise (operand 2 + operand 3) is moved. @itemx @samp{cond_smax@var{mode}} @itemx @samp{cond_umin@var{mode}} @itemx @samp{cond_umax@var{mode}} +@itemx @samp{cond_ashl@var{mode}} +@itemx @samp{cond_ashr@var{mode}} +@itemx @samp{cond_lshr@var{mode}} When operand 1 is true, perform an operation on operands 2 and 3 and store the result in operand 0, otherwise store operand 4 in operand 0. The operation works elementwise if the operands are vectors. @@ -6962,6 +6968,11 @@ Operands 0, 2, 3 and 4 all have mode @var{m}. Operand 1 is a scalar integer if @var{m} is scalar, otherwise it has the mode returned by @code{TARGET_VECTORIZE_GET_MASK_MODE}. +@samp{cond_@var{op}@var{mode}} generally corresponds to a conditional +form of @samp{@var{op}@var{mode}3}. As an exception, the vector forms +of shifts correspond to patterns like @code{vashl@var{mode}3} rather +than patterns like @code{ashl@var{mode}3}. 
+ @cindex @code{cond_fma@var{mode}} instruction pattern @cindex @code{cond_fms@var{mode}} instruction pattern @cindex @code{cond_fnma@var{mode}} instruction pattern diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index 884f1e1..b91a9b5 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -29389,7 +29389,18 @@ dwarf2out_assembly_start (void) output_quoted_string (asm_out_file, remap_debug_filename (filename0)); fputc ('\n', asm_out_file); } + else #endif + /* Workaround for PR101575: output a dummy .file directive. */ + if (!last_emitted_file && dwarf_debuginfo_p () + && debug_info_level >= DINFO_LEVEL_TERSE) + { + const char *filename0 = get_AT_string (comp_unit_die (), DW_AT_name); + + if (filename0 == NULL) + filename0 = "<dummy>"; + maybe_emit_file (lookup_filename (filename0)); + } } /* A helper function for dwarf2out_finish called through @@ -1110,8 +1110,8 @@ class op_by_pieces_d } public: - op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool, + op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn, + void *, unsigned HOST_WIDE_INT, unsigned int, bool, bool = false); void run (); }; @@ -1120,10 +1120,12 @@ class op_by_pieces_d objects named TO and FROM, which are identified as loads or stores by TO_LOAD and FROM_LOAD. If FROM is a load, the optional FROM_CFN and its associated FROM_CFN_DATA can be used to replace loads with - constant values. LEN describes the length of the operation. */ + constant values. MAX_PIECES describes the maximum number of bytes + that can be moved efficiently at a time. LEN describes the length + of the operation. */ -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, - rtx from, bool from_load, +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, + bool to_load, rtx from, bool from_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, @@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_len (len), m_max_size (max_pieces + 1), m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); @@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, - PUSHG_P (to)) + : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL, + NULL, len, align, PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align, bool qi_vector_mode) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false, qi_vector_mode) + : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn, + cfn_data, len, align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, - align, false) + : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn, + op1_cfn_data, len, align, false) { m_fail_label = fail_label; }
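The op_by_pieces_d change above is a parameterization refactor whose shape is easier to see in isolation. A compressed sketch (simplified stand-ins, not the real GCC classes or macro values): the base class stops baking MOVE_MAX_PIECES into m_max_size and instead takes each operation's limit from the derived constructor, so moves, stores and compares can pass MOVE_MAX_PIECES, STORE_MAX_PIECES and COMPARE_MAX_PIECES respectively.

// Stand-in constants; the real limits come from target macros.
constexpr unsigned kMoveMaxPieces = 16;
constexpr unsigned kStoreMaxPieces = 64;

class op_by_pieces_sketch
{
public:
  // The limit is now a constructor argument rather than a fixed macro;
  // m_max_size stays an exclusive bound, hence the "+ 1" as in the patch.
  explicit op_by_pieces_sketch (unsigned max_pieces)
    : m_max_size (max_pieces + 1) {}
protected:
  unsigned m_max_size;
};

class move_sketch : public op_by_pieces_sketch
{
public:
  move_sketch () : op_by_pieces_sketch (kMoveMaxPieces) {}
};

class store_sketch : public op_by_pieces_sketch
{
public:
  store_sketch () : op_by_pieces_sketch (kStoreMaxPieces) {}
};

int main ()
{
  move_sketch mv;   // pieces capped at kMoveMaxPieces bytes
  store_sketch st;  // pieces capped at kStoreMaxPieces bytes
  (void) mv;
  (void) st;
  return 0;
}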
diff --git a/gcc/gcov-io.c b/gcc/gcov-io.c index 4b1e11d..7819593 100644 --- a/gcc/gcov-io.c +++ b/gcc/gcov-io.c @@ -199,7 +199,9 @@ gcov_close (void) { if (gcov_var.file) { - fclose (gcov_var.file); + if (fclose (gcov_var.file)) + gcov_var.error = 1; + gcov_var.file = 0; } gcov_var.mode = 0; diff --git a/gcc/gimple-range-path.h b/gcc/gimple-range-path.h index 43f0ec8..0d2d2e7 100644 --- a/gcc/gimple-range-path.h +++ b/gcc/gimple-range-path.h @@ -41,7 +41,7 @@ public: const bitmap_head *imports); bool range_of_expr (irange &r, tree name, gimple * = NULL) override; bool range_of_stmt (irange &r, gimple *, tree name = NULL) override; - void dump (FILE *); + void dump (FILE *) override; void debug (); private: diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 95b9340..394530c 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -0a4d612e6b211780b294717503fc739bbd1f509c +b47bcf942daa9a0c252db9b57b8f138adbfcdaa2 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/escape.cc b/gcc/go/gofrontend/escape.cc index cf68874..c8978ac 100644 --- a/gcc/go/gofrontend/escape.cc +++ b/gcc/go/gofrontend/escape.cc @@ -1608,8 +1608,33 @@ Escape_analysis_assign::expression(Expression** pexpr) } break; - default: + case Builtin_call_expression::BUILTIN_CLOSE: + case Builtin_call_expression::BUILTIN_DELETE: + case Builtin_call_expression::BUILTIN_PRINT: + case Builtin_call_expression::BUILTIN_PRINTLN: + case Builtin_call_expression::BUILTIN_LEN: + case Builtin_call_expression::BUILTIN_CAP: + case Builtin_call_expression::BUILTIN_COMPLEX: + case Builtin_call_expression::BUILTIN_REAL: + case Builtin_call_expression::BUILTIN_IMAG: + case Builtin_call_expression::BUILTIN_RECOVER: + case Builtin_call_expression::BUILTIN_ALIGNOF: + case Builtin_call_expression::BUILTIN_OFFSETOF: + case Builtin_call_expression::BUILTIN_SIZEOF: + // these do not escape. + break; + + case Builtin_call_expression::BUILTIN_ADD: + case Builtin_call_expression::BUILTIN_SLICE: + // handled in ::assign. break; + + case Builtin_call_expression::BUILTIN_MAKE: + case Builtin_call_expression::BUILTIN_NEW: + // should have been lowered to runtime calls at this point. + // fallthrough + default: + go_unreachable(); } break; } @@ -2325,19 +2350,82 @@ Escape_analysis_assign::assign(Node* dst, Node* src) } break; + case Expression::EXPRESSION_SLICE_INFO: + { + Slice_info_expression* sie = e->slice_info_expression(); + if (sie->info() == Expression::SLICE_INFO_VALUE_POINTER) + { + Node* slice = Node::make_node(sie->slice()); + this->assign(dst, slice); + } + } + break; + case Expression::EXPRESSION_CALL: { Call_expression* call = e->call_expression(); if (call->is_builtin()) { Builtin_call_expression* bce = call->builtin_call_expression(); - if (bce->code() == Builtin_call_expression::BUILTIN_APPEND) + switch (bce->code()) { - // Append returns the first argument. - // The subsequent arguments are already leaked because - // they are operands to append. - Node* appendee = Node::make_node(call->args()->front()); - this->assign(dst, appendee); + case Builtin_call_expression::BUILTIN_APPEND: + { + // Append returns the first argument. + // The subsequent arguments are already leaked because + // they are operands to append. + Node* appendee = Node::make_node(call->args()->front()); + this->assign(dst, appendee); + } + break; + + case Builtin_call_expression::BUILTIN_ADD: + { + // unsafe.Add(p, off). 
+ // Flow p to result. + Node* arg = Node::make_node(call->args()->front()); + this->assign(dst, arg); + } + break; + + case Builtin_call_expression::BUILTIN_SLICE: + { + // unsafe.Slice(p, len). + // The resulting slice has the same backing store as p. Flow p to result. + Node* arg = Node::make_node(call->args()->front()); + this->assign(dst, arg); + } + break; + + case Builtin_call_expression::BUILTIN_LEN: + case Builtin_call_expression::BUILTIN_CAP: + case Builtin_call_expression::BUILTIN_COMPLEX: + case Builtin_call_expression::BUILTIN_REAL: + case Builtin_call_expression::BUILTIN_IMAG: + case Builtin_call_expression::BUILTIN_RECOVER: + case Builtin_call_expression::BUILTIN_ALIGNOF: + case Builtin_call_expression::BUILTIN_OFFSETOF: + case Builtin_call_expression::BUILTIN_SIZEOF: + // these do not escape. + break; + + case Builtin_call_expression::BUILTIN_COPY: + // handled in ::expression. + break; + + case Builtin_call_expression::BUILTIN_CLOSE: + case Builtin_call_expression::BUILTIN_DELETE: + case Builtin_call_expression::BUILTIN_PRINT: + case Builtin_call_expression::BUILTIN_PRINTLN: + case Builtin_call_expression::BUILTIN_PANIC: + // these do not have a result. + // fallthrough + case Builtin_call_expression::BUILTIN_MAKE: + case Builtin_call_expression::BUILTIN_NEW: + // should have been lowered to runtime calls at this point. + // fallthrough + default: + go_unreachable(); } break; } @@ -2592,6 +2680,21 @@ Escape_analysis_assign::assign(Node* dst, Node* src) } break; + case Expression::EXPRESSION_CONDITIONAL: + { + Conditional_expression* ce = e->conditional_expression(); + this->assign(dst, Node::make_node(ce->then_expr())); + this->assign(dst, Node::make_node(ce->else_expr())); + } + break; + + case Expression::EXPRESSION_COMPOUND: + { + Compound_expression* ce = e->compound_expression(); + this->assign(dst, Node::make_node(ce->expr())); + } + break; + + default: + // TODO(cmang): Add debug info here; this should not be reachable. + // For now, just to be conservative, we'll just say dst flows to src. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 15c9eab..3e433d6 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -3962,7 +3962,10 @@ Type_conversion_expression::do_lower(Gogo*, Named_object*, if (type->points_to() != NULL && type->points_to()->array_type() != NULL && !type->points_to()->is_slice_type() - && val->type()->is_slice_type()) + && val->type()->is_slice_type() + && Type::are_identical(type->points_to()->array_type()->element_type(), + val->type()->array_type()->element_type(), + 0, NULL)) { Temporary_statement* val_temp = NULL; if (!val->is_multi_eval_safe()) @@ -18006,49 +18009,7 @@ Expression::make_type_info(Type* type, Type_info type_info) return new Type_info_expression(type, type_info); } -// An expression that evaluates to some characteristic of a slice. -// This is used when indexing, bound-checking, or nil checking a slice.
- -class Slice_info_expression : public Expression -{ - public: - Slice_info_expression(Expression* slice, Slice_info slice_info, - Location location) - : Expression(EXPRESSION_SLICE_INFO, location), - slice_(slice), slice_info_(slice_info) - { } - - protected: - Type* - do_type(); - - void - do_determine_type(const Type_context*) - { } - - Expression* - do_copy() - { - return new Slice_info_expression(this->slice_->copy(), this->slice_info_, - this->location()); - } - - Bexpression* - do_get_backend(Translate_context* context); - - void - do_dump_expression(Ast_dump_context*) const; - - void - do_issue_nil_check() - { this->slice_->issue_nil_check(); } - - private: - // The slice for which we are getting information. - Expression* slice_; - // What information we want. - Slice_info slice_info_; -}; +// Slice_info_expression. // Return the type of the slice info. diff --git a/gcc/go/gofrontend/expressions.h b/gcc/go/gofrontend/expressions.h index 492849b..79a8785 100644 --- a/gcc/go/gofrontend/expressions.h +++ b/gcc/go/gofrontend/expressions.h @@ -62,6 +62,7 @@ class Type_guard_expression; class Heap_expression; class Receive_expression; class Slice_value_expression; +class Slice_info_expression; class Conditional_expression; class Compound_expression; class Numeric_constant; @@ -900,6 +901,14 @@ class Expression compound_expression() { return this->convert<Compound_expression, EXPRESSION_COMPOUND>(); } + // If this is a slice info expression, return the + // Slice_info_expression structure. Otherwise, return NULL. + Slice_info_expression* + slice_info_expression() + { + return this->convert<Slice_info_expression, EXPRESSION_SLICE_INFO>(); + } + // Return true if this is a composite literal. bool is_composite_literal() const; @@ -4262,6 +4271,60 @@ class Slice_value_expression : public Expression Expression* cap_; }; +// An expression that evaluates to some characteristic of a slice. +// This is used when indexing, bound-checking, or nil checking a slice. + +class Slice_info_expression : public Expression +{ + public: + Slice_info_expression(Expression* slice, Slice_info slice_info, + Location location) + : Expression(EXPRESSION_SLICE_INFO, location), + slice_(slice), slice_info_(slice_info) + { } + + // The slice operand of this slice info expression. + Expression* + slice() const + { return this->slice_; } + + // The info this expression is about. + Slice_info + info() const + { return this->slice_info_; } + + protected: + Type* + do_type(); + + void + do_determine_type(const Type_context*) + { } + + Expression* + do_copy() + { + return new Slice_info_expression(this->slice_->copy(), this->slice_info_, + this->location()); + } + + Bexpression* + do_get_backend(Translate_context* context); + + void + do_dump_expression(Ast_dump_context*) const; + + void + do_issue_nil_check() + { this->slice_->issue_nil_check(); } + + private: + // The slice for which we are getting information. + Expression* slice_; + // What information we want. + Slice_info slice_info_; +}; + // Conditional expressions. 
class Conditional_expression : public Expression @@ -4277,6 +4340,14 @@ class Conditional_expression : public Expression condition() const { return this->cond_; } + Expression* + then_expr() const + { return this->then_; } + + Expression* + else_expr() const + { return this->else_; } + protected: int do_traverse(Traverse*); @@ -4322,6 +4393,10 @@ class Compound_expression : public Expression init() const { return this->init_; } + Expression* + expr() const + { return this->expr_; } + protected: int do_traverse(Traverse*); diff --git a/gcc/go/gofrontend/runtime.def b/gcc/go/gofrontend/runtime.def index fad8ceb..87a2708 100644 --- a/gcc/go/gofrontend/runtime.def +++ b/gcc/go/gofrontend/runtime.def @@ -204,12 +204,8 @@ DEF_GO_RUNTIME(SELECTNBSEND, "runtime.selectnbsend", P2(CHAN, POINTER), R1(BOOL) // Non-blocking receive a value from a channel, used for two-case select // statement with a default case. -DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN), R1(BOOL)) - -// Non-blocking tuple receive from a channel, used for two-case select -// statement with a default case. -DEF_GO_RUNTIME(SELECTNBRECV2, "runtime.selectnbrecv2", P3(POINTER, POINTER, CHAN), - R1(BOOL)) +DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN), + R2(BOOL, BOOL)) // Block execution. Used for zero-case select. DEF_GO_RUNTIME(BLOCK, "runtime.block", P0(), R0()) diff --git a/gcc/go/gofrontend/statements.cc b/gcc/go/gofrontend/statements.cc index 9643d1b..95fa3c4 100644 --- a/gcc/go/gofrontend/statements.cc +++ b/gcc/go/gofrontend/statements.cc @@ -6051,7 +6051,7 @@ Select_statement::lower_two_case(Block* b) Expression* chanref = Expression::make_temporary_reference(chantmp, loc); Block* bchan; - Expression* call; + Expression* cond; if (chancase.is_send()) { // if selectnbsend(chan, &val) { body } else { default body } @@ -6065,7 +6065,7 @@ Select_statement::lower_two_case(Block* b) Expression* ref = Expression::make_temporary_reference(ts, loc); Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc); - call = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr); + cond = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr); bchan = chancase.statements(); } else @@ -6075,34 +6075,31 @@ Select_statement::lower_two_case(Block* b) Expression* ref = Expression::make_temporary_reference(ts, loc); Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc); - Expression* okref = NULL; - if (chancase.closed() == NULL && chancase.closedvar() == NULL) - { - // Simple receive. - // if selectnbrecv(&lhs, chan) { body } else { default body } - call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2, addr, chanref); - } - else - { - // Tuple receive. 
- // if selectnbrecv2(&lhs, &ok, chan) { body } else { default body } - - Type* booltype = Type::make_boolean_type(); - Temporary_statement* okts = Statement::make_temporary(booltype, NULL, - loc); - b->add_statement(okts); - - okref = Expression::make_temporary_reference(okts, loc); - Expression* okaddr = Expression::make_unary(OPERATOR_AND, okref, loc); - call = Runtime::make_call(Runtime::SELECTNBRECV2, loc, 3, addr, okaddr, - chanref); - } + + // selected, ok = selectnbrecv(&lhs, chan) + Call_expression* call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2, + addr, chanref); + + Temporary_statement* selected_temp = + Statement::make_temporary(Type::make_boolean_type(), + Expression::make_call_result(call, 0), + loc); + b->add_statement(selected_temp); + + Temporary_statement* ok_temp = + Statement::make_temporary(Type::make_boolean_type(), + Expression::make_call_result(call, 1), + loc); + b->add_statement(ok_temp); + + cond = Expression::make_temporary_reference(selected_temp, loc); Location cloc = chancase.location(); bchan = new Block(b, loc); if (chancase.val() != NULL && !chancase.val()->is_sink_expression()) { - Statement* as = Statement::make_assignment(chancase.val(), ref->copy(), + Statement* as = Statement::make_assignment(chancase.val(), + ref->copy(), cloc); bchan->add_statement(as); } @@ -6114,12 +6111,18 @@ Select_statement::lower_two_case(Block* b) if (chancase.closed() != NULL && !chancase.closed()->is_sink_expression()) { + Expression* okref = Expression::make_temporary_reference(ok_temp, + cloc); Statement* as = Statement::make_assignment(chancase.closed(), - okref->copy(), cloc); + okref, cloc); bchan->add_statement(as); } else if (chancase.closedvar() != NULL) - chancase.closedvar()->var_value()->set_init(okref->copy()); + { + Expression* okref = Expression::make_temporary_reference(ok_temp, + cloc); + chancase.closedvar()->var_value()->set_init(okref); + } Statement* bs = Statement::make_block_statement(chancase.statements(), cloc); @@ -6127,7 +6130,7 @@ Select_statement::lower_two_case(Block* b) } Statement* ifs = - Statement::make_if_statement(call, bchan, defcase.statements(), loc); + Statement::make_if_statement(cond, bchan, defcase.statements(), loc); b->add_statement(ifs); Statement* label = diff --git a/gcc/go/gofrontend/types.cc b/gcc/go/gofrontend/types.cc index 7c7b2eb..0c44186 100644 --- a/gcc/go/gofrontend/types.cc +++ b/gcc/go/gofrontend/types.cc @@ -846,7 +846,9 @@ Type::are_convertible(const Type* lhs, const Type* rhs, std::string* reason) if (rhs->is_slice_type() && lhs->points_to() != NULL && lhs->points_to()->array_type() != NULL - && !lhs->points_to()->is_slice_type()) + && !lhs->points_to()->is_slice_type() + && Type::are_identical(lhs->points_to()->array_type()->element_type(), + rhs->array_type()->element_type(), 0, reason)) return true; // An unsafe.Pointer type may be converted to any pointer type or to diff --git a/gcc/match.pd b/gcc/match.pd index 19cbad7..0fcfd0e 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -2833,6 +2833,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (convert (mult (convert:t @0) { cst; }))))) #endif +/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when + tree_nonzero_bits allows IOR and XOR to be treated like PLUS. + Likewise, handle (X<<C3) and X as legitimate variants of X*C. 
*/ +(for op (bit_ior bit_xor) + (simplify + (op (mult:s@0 @1 INTEGER_CST@2) + (mult:s@3 @1 INTEGER_CST@4)) + (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type) + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) + (mult @1 + { wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4)); }))) + (simplify + (op:c (mult:s@0 @1 INTEGER_CST@2) + (lshift:s@3 @1 INTEGER_CST@4)) + (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type) + && tree_int_cst_sgn (@4) > 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) + (with { wide_int wone = wi::one (TYPE_PRECISION (type)); + wide_int c = wi::add (wi::to_wide (@2), + wi::lshift (wone, wi::to_wide (@4))); } + (mult @1 { wide_int_to_tree (type, c); })))) + (simplify + (op:c (mult:s@0 @1 INTEGER_CST@2) + @1) + (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type) + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0) + (mult @1 + { wide_int_to_tree (type, + wi::add (wi::to_wide (@2), 1)); }))) + (simplify + (op (lshift:s@0 @1 INTEGER_CST@2) + (lshift:s@3 @1 INTEGER_CST@4)) + (if (INTEGRAL_TYPE_P (type) + && tree_int_cst_sgn (@2) > 0 + && tree_int_cst_sgn (@4) > 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) + t = unsigned_type_for (t); + wide_int wone = wi::one (TYPE_PRECISION (t)); + wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), + wi::lshift (wone, wi::to_wide (@4))); } + (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); }))))) + (simplify + (op:c (lshift:s@0 @1 INTEGER_CST@2) + @1) + (if (INTEGRAL_TYPE_P (type) + && tree_int_cst_sgn (@2) > 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) + t = unsigned_type_for (t); + wide_int wone = wi::one (TYPE_PRECISION (t)); + wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); } + (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })))))) + /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ (for minmax (min max FMIN_ALL FMAX_ALL) diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index f7f3acb..d37f778 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -6957,6 +6957,25 @@ register_asm_p (const_rtx x) (vec_select:RESULT_MODE OP SEL) + is equivalent to the highpart RESULT_MODE of OP. */ + +bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel) +{ + int nunits; + if (GET_MODE_NUNITS (op_mode).is_constant (&nunits) + && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS)) + { + int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0); + return rtvec_series_p (XVEC (sel, 0), offset); + } + return false; +} + +/* Return true if, for all OP of mode OP_MODE: + + (vec_select:RESULT_MODE OP SEL) + is equivalent to the lowpart RESULT_MODE of OP. 
*/ bool diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h index e164242..542dc78 100644 --- a/gcc/rtlanal.h +++ b/gcc/rtlanal.h @@ -332,6 +332,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base () using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>; bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, + rtx sel); + +bool vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel); #endif diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3806434..04b011b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,185 @@ +2021-08-04 David Malcolm <dmalcolm@redhat.com> + + PR analyzer/101570 + * gcc.dg/analyzer/asm-x86-1.c: New test. + * gcc.dg/analyzer/asm-x86-lp64-1.c: New test. + * gcc.dg/analyzer/asm-x86-lp64-2.c: New test. + * gcc.dg/analyzer/pr101570.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c: + New test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c: + New test. + * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c: + New test. + +2021-08-04 H.J. Lu <hjl.tools@gmail.com> + + PR target/101742 + * gcc.target/i386/pr101742a.c: New test. + * gcc.target/i386/pr101742b.c: Likewise. + +2021-08-04 H.J. Lu <hjl.tools@gmail.com> + + PR target/101772 + * gcc.target/i386/eh_return-2.c: New test. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * gcc.target/s390/vector/perm-vpdi.c: New test. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * gcc.target/s390/vector/perm-merge.c: New test. + * gcc.target/s390/vector/vec-types.h: New test. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * gcc.target/s390/zvector/vec-permi.c: Removed. + * gcc.target/s390/zvector/vec_permi.c: New test. + +2021-08-04 Andreas Krebbel <krebbel@linux.ibm.com> + + * gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c: + Instead of vpdi with 0 and 5 vmrlg and vmrhg are used now. + * gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: Likewise. + * gcc.target/s390/zvector/vec-types.h: New test. + * gcc.target/s390/zvector/vec_merge.c: New test. + +2021-08-04 Jonathan Wright <jonathan.wright@arm.com> + + * gcc.target/aarch64/vmul_high_cost.c: New test. + +2021-08-04 Jonathan Wright <jonathan.wright@arm.com> + + * gcc.target/aarch64/vmul_element_cost.c: New test. + +2021-08-04 Richard Sandiford <richard.sandiford@arm.com> + + * gcc.target/aarch64/sve/cost_model_12.c: New test. + +2021-08-04 Tamar Christina <tamar.christina@arm.com> + + PR tree-optimization/101750 + * g++.dg/vect/pr99149.cc: Name class. + +2021-08-04 Richard Biener <rguenther@suse.de> + + * gcc.target/i386/vect-gather-1.c: New testcase. + * gfortran.dg/vect/vect-8.f90: Adjust. + +2021-08-04 Roger Sayle <roger@nextmovesoftware.com> + Marc Glisse <marc.glisse@inria.fr> + + * gcc.dg/fold-ior-4.c: New test. + +2021-08-04 Richard Biener <rguenther@suse.de> + + PR tree-optimization/101756 + * gcc.dg/vect/bb-slp-pr101756.c: New testcase. + +2021-08-04 Jakub Jelinek <jakub@redhat.com> + + PR c++/101759 + * g++.dg/gomp/pr101759.C: New test. + * g++.dg/goacc/pr101759.C: New test. 
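The match.pd canonicalization added earlier rests on a single bit-level fact: when two terms share no nonzero bits, adding them generates no carries, so IOR, XOR and PLUS all agree, and (X*C1)|(X*C2) may fold to X*(C1+C2). A stand-alone exhaustive check of one disjoint-bits case (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main ()
{
  for (uint32_t x = 0; x < 256; ++x)
    {
      // x*1 occupies bits 0..7 and x*0x100 occupies bits 8..15, so
      // their nonzero bits are disjoint and no carry can occur:
      // |, ^ and + then all compute the same value, x*0x101.
      assert (((x * 1u) | (x * 0x100u)) == x * 0x101u);
      assert (((x * 1u) ^ (x * 0x100u)) == x * 0x101u);
      assert (((x * 1u) + (x * 0x100u)) == x * 0x101u);
    }
  return 0;
}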
+ +2021-08-04 Jakub Jelinek <jakub@redhat.com> + + * gcc.c-torture/execute/ieee/pr29302-1.x: Undo doubly applied patch. + +2021-08-04 Richard Biener <rguenther@suse.de> + + PR tree-optimization/101769 + * g++.dg/tree-ssa/pr101769.C: New testcase. + +2021-08-04 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_addsubmul_d-2.c: Add + dg-require-effective-target for avx512. + * gcc.target/i386/cond_op_addsubmul_q-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmul_w-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmuldiv_double-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmuldiv_float-2.c: Ditto. + * gcc.target/i386/cond_op_fma_double-2.c: Ditto. + * gcc.target/i386/cond_op_fma_float-2.c: Ditto. + +2021-08-04 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_fma_double-1.c: New test. + * gcc.target/i386/cond_op_fma_double-2.c: New test. + * gcc.target/i386/cond_op_fma_float-1.c: New test. + * gcc.target/i386/cond_op_fma_float-2.c: New test. + +2021-08-03 Eugene Rozenfeld <erozen@microsoft.com> + + * lib/profopt.exp: Pass gdwarf-4 when compiling test to profile; pass -gcov_version=2. + * lib/target-supports.exp: Remove unnecessary -o perf.data passed to gcc-auto-profile. + +2021-08-03 Eugene Rozenfeld <erozen@microsoft.com> + + * gcc.dg/tree-prof/indir-call-prof-2.c: Fix dg-final-use-autofdo. + * lib/profopt.exp: Pass -fearly-inlining when compiling with AutoFDO. + +2021-08-03 Eugene Rozenfeld <erozen@microsoft.com> + + * g++.dg/tree-prof/indir-call-prof.C: Fix options, increase the number of iterations. + * g++.dg/tree-prof/morefunc.C: Fix options, increase the number of iterations. + * g++.dg/tree-prof/reorder.C: Fix options, increase the number of iterations. + * gcc.dg/tree-prof/indir-call-prof-2.c: Fix options, increase the number of iterations. + * gcc.dg/tree-prof/indir-call-prof.c: Fix options. + +2021-08-03 Martin Sebor <msebor@redhat.com> + + PR testsuite/101688 + * g++.dg/warn/Wstringop-overflow-4.C: Disable a test case in ILP32. + +2021-08-03 Paul A. Clarke <pc@us.ibm.com> + + * gcc.target/powerpc/sse4_1-phminposuw.c: Copy from + gcc/testsuite/gcc.target/i386, adjust dg directives to suit, + make more robust. + +2021-08-03 H.J. Lu <hjl.tools@gmail.com> + + * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to + disable XMM31. + * gcc.target/i386/avx-vzeroupper-15.c: Likewise. + * gcc.target/i386/pr82941-1.c: Updated. Check for vzeroupper. + * gcc.target/i386/pr82942-1.c: Likewise. + * gcc.target/i386/pr82990-1.c: Likewise. + * gcc.target/i386/pr82990-3.c: Likewise. + * gcc.target/i386/pr82990-5.c: Likewise. + * gcc.target/i386/pr100865-4b.c: Likewise. + * gcc.target/i386/pr100865-6b.c: Likewise. + * gcc.target/i386/pr100865-7b.c: Likewise. + * gcc.target/i386/pr100865-10b.c: Likewise. + * gcc.target/i386/pr100865-8b.c: Updated. + * gcc.target/i386/pr100865-9b.c: Likewise. + * gcc.target/i386/pr100865-11b.c: Likewise. + * gcc.target/i386/pr100865-12b.c: Likewise. + +2021-08-03 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_addsubmul_d-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_d-2.c: New test. + * gcc.target/i386/cond_op_addsubmul_q-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_q-2.c: New test. + * gcc.target/i386/cond_op_addsubmul_w-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_w-2.c: New test. + +2021-08-03 Jakub Jelinek <jakub@redhat.com> + + PR analyzer/101721 + * gcc.dg/analyzer/pr101721.c: New test. + 2021-08-03 H.J. 
Lu <hjl.tools@gmail.com> PR target/80566 diff --git a/gcc/testsuite/g++.dg/goacc/pr101759.C b/gcc/testsuite/g++.dg/goacc/pr101759.C new file mode 100644 index 0000000..522a5d4 --- /dev/null +++ b/gcc/testsuite/g++.dg/goacc/pr101759.C @@ -0,0 +1,5 @@ +// PR c++/101759 +// { dg-do compile { target c++11 } } + +#pragma acc routine +int foo (int x = []() { extern int bar (int); return 1; }()); diff --git a/gcc/testsuite/g++.dg/gomp/pr101759.C b/gcc/testsuite/g++.dg/gomp/pr101759.C new file mode 100644 index 0000000..905b875 --- /dev/null +++ b/gcc/testsuite/g++.dg/gomp/pr101759.C @@ -0,0 +1,8 @@ +// PR c++/101759 +// { dg-do compile { target c++11 } } + +#pragma omp declare simd +int foo (int x = []() { extern int bar (int); return 1; }()); +int corge (int = 1); +#pragma omp declare variant (corge) match (user={condition(true)}) +int baz (int x = []() { extern int qux (int); return 1; }()); diff --git a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C b/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C index 3374744..b454171 100644 --- a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C +++ b/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized" } */ struct A { A () {} @@ -26,7 +26,7 @@ main (void) int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) { p = (A *)wrap ((void *)&a); p->AA (); diff --git a/gcc/testsuite/g++.dg/tree-prof/morefunc.C b/gcc/testsuite/g++.dg/tree-prof/morefunc.C index 621d09a..96e0073 100644 --- a/gcc/testsuite/g++.dg/tree-prof/morefunc.C +++ b/gcc/testsuite/g++.dg/tree-prof/morefunc.C @@ -1,4 +1,5 @@ -/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo -Wno-attributes -Wno-coverage-mismatch -Wno-missing-profile" } */ +/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized -Wno-attributes -Wno-coverage-mismatch -Wno-missing-profile" } */ + #include "reorder_class1.h" #include "reorder_class2.h" @@ -19,7 +20,7 @@ static __attribute__((always_inline)) void test1 (A *tc) { int i; - for (i = 0; i < 1000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); if (g<100) g++; } @@ -28,7 +29,7 @@ static __attribute__((always_inline)) void test2 (B *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); } diff --git a/gcc/testsuite/g++.dg/tree-prof/reorder.C b/gcc/testsuite/g++.dg/tree-prof/reorder.C index 000fb65..5049096 100644 --- a/gcc/testsuite/g++.dg/tree-prof/reorder.C +++ b/gcc/testsuite/g++.dg/tree-prof/reorder.C @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo -Wno-coverage-mismatch -Wno-attributes" } */ +/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized -Wno-coverage-mismatch -Wno-attributes" } */ #ifdef _PROFILE_USE #include "reorder_class1.h" @@ -13,7 +13,7 @@ static __attribute__((always_inline)) void test1 (A *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); if (g<100) g++; } @@ -22,7 +22,7 @@ static __attribute__((always_inline)) void test2 (B *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); } diff --git 
a/gcc/testsuite/g++.dg/tree-ssa/pr101769.C b/gcc/testsuite/g++.dg/tree-ssa/pr101769.C new file mode 100644 index 0000000..4979c42 --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/pr101769.C @@ -0,0 +1,56 @@ +// { dg-do compile } +// { dg-require-effective-target c++11 } +// { dg-options "-O2 -fdump-tree-optimized" } + +struct Node +{ + Node* right; + Node* down; +}; + +inline +void free_node(Node*) +{ +} + +void free_all(Node* n_) +{ + if (n_ == nullptr) { + return; + } + free_all(n_->right); + do { + Node* t = n_->down; + free_node(n_); + n_ = t; + } while (n_); +} + +void free_all2_r(Node* n_) +{ + if (n_->right) { + free_all2_r(n_->right); + } + do { + Node* t = n_->down; + free_node(n_); + n_ = t; + } while (n_); +} + +void free_all2(Node* n_) +{ + if (n_) { + free_all2_r(n_); + } +} + +void loop(Node* n_) +{ + do { + n_ = n_->down; + } while (n_); +} + +// All functions should be empty. +// { dg-final { scan-tree-dump-times "<bb " 4 "optimized" } } diff --git a/gcc/testsuite/g++.dg/vect/pr99149.cc b/gcc/testsuite/g++.dg/vect/pr99149.cc index 00ebe9d..9d58426 100755 --- a/gcc/testsuite/g++.dg/vect/pr99149.cc +++ b/gcc/testsuite/g++.dg/vect/pr99149.cc @@ -11,7 +11,7 @@ public: a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); } }; int f, g; -class { +class mp { a *h; a *i; diff --git a/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C b/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C index 121239a..c80977d 100644 --- a/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C +++ b/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C @@ -145,7 +145,12 @@ void test_strcpy_new_int16_t (size_t n, const size_t vals[]) T (S (9), new int16_t[r_imin_imax * 2 + 1]); int r_0_imax = SR (0, INT_MAX); - T (S (1), new int16_t[r_0_imax]); + + if (sizeof (int) < sizeof (size_t)) + /* The code below might emit a warning when int is the same size + as size_t as a result of threading. See PR 101688 comment #2. */ + T (S (1), new int16_t[r_0_imax]); + T (S (2), new int16_t[r_0_imax + 1]); T (S (9), new int16_t[r_0_imax * 2 + 1]); diff --git a/gcc/testsuite/g++.old-deja/g++.other/inline7.C b/gcc/testsuite/g++.old-deja/g++.other/inline7.C index a3723cf..6260000 100644 --- a/gcc/testsuite/g++.old-deja/g++.other/inline7.C +++ b/gcc/testsuite/g++.old-deja/g++.other/inline7.C @@ -8,7 +8,7 @@ std::list<int*> li; void f () { - li.size (); + (void) li.size (); } int main () diff --git a/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x b/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x index 1922b14..3ae2096 100644 --- a/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x +++ b/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x @@ -4,9 +4,3 @@ if { [istarget "tic6x-*-*"] && [check_effective_target_ti_c67x] } { return 1 } return 0 -if { [istarget "tic6x-*-*"] && [check_effective_target_ti_c67x] } { - # C6X uses -freciprocal-math by default. 
- set torture_execute_xfail "tic6x-*-*" - return 1 -} -return 0 diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c new file mode 100644 index 0000000..f6026b7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c @@ -0,0 +1,69 @@ +/* { dg-do assemble { target x86_64-*-* } } */ + +#include "analyzer-decls.h" + +int test_out (void) +{ + int dst_a, dst_b; + asm ("mov 42, %0" + : "=r" (dst_a)); + asm ("mov 42, %0" + : "=r" (dst_b)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +int test_out_in (int src_a) +{ + int dst_a, dst_b; + asm ("mov %1, %0" + : "=r" (dst_a) + : "r" (src_a)); + asm ("mov %1, %0" + : "=r" (dst_b) + : "r" (src_a)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +int test_out_in_in (int src_a, int src_b) +{ + int dst_a, dst_b; + asm ("mov %1, %0;\n" + "add %2, %0" + : "=r" (dst_a) + : "r" (src_a), + "r" (src_b)); + asm ("mov %1, %0;\n" + "add %2, %0" + : "=r" (dst_b) + : "r" (src_a), + "r" (src_b)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +void test_inout_1 (int v) +{ + int saved = v; + int result_a, result_b; + asm ("dec %0" + : "+r" (v)); + result_a = v; + + asm ("dec %0" + : "+r" (v)); + result_b = v; + + __analyzer_eval (v == saved); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (v == result_a); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (v == result_b); /* { dg-warning "TRUE" } */ +} + +void test_inout_2 (void) +{ + int v; + int result_a, result_b; + asm ("dec %0" /* { dg-warning "use of uninitialized value 'v'" } */ + : "+r" (v)); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c new file mode 100644 index 0000000..c235e22 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c @@ -0,0 +1,131 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +#include "analyzer-decls.h" + +#include <stdint.h> + +int test_1 (int src) +{ + int dst; + asm ("mov %1, %0\n\t" + "add $1, %0" + : "=r" (dst) + : "r" (src)); + return dst; +} + +uint32_t test_2 (uint32_t Mask) +{ + uint32_t Index; + asm ("bsfl %[aMask], %[aIndex]" + : [aIndex] "=r" (Index) + : [aMask] "r" (Mask) + : "cc"); + return Index; +} + +int test_3a (int p1, int p2) +{ + asm goto ("btl %1, %0\n\t" + "jc %l2" + : // No outputs + : "r" (p1), "r" (p2) + : "cc" + : carry); + + return 0; + + carry: + return 1; +} + +int test_3b (int p1, int p2) +{ + asm goto ("btl %1, %0\n\t" + "jc %l[carry]" + : // No outputs + : "r" (p1), "r" (p2) + : "cc" + : carry); + + return 0; + + carry: + return 1; +} + +uint64_t test_4 (void) +{ + uint64_t start_time, end_time; + + // Get start time + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. + "or %%rdx, %0" // 'Or' in the lower bits. + : "=a" (start_time) + : + : "rdx"); + + // could do other work here + + // Get end time + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. + "or %%rdx, %0" // 'Or' in the lower bits. + : "=a" (end_time) + : + : "rdx"); + + __analyzer_eval (start_time == end_time); /* { dg-warning "UNKNOWN" } */ + + // Get elapsed time + return end_time - start_time; +} + +static uint64_t get_time (void) +{ + uint64_t result; + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. 
+ "or %%rdx, %0" // 'Or' in the lower bits. + : "=a" (result) + : + : "rdx"); + return result; +} + +uint64_t test_4a (void) +{ + uint64_t start_time, end_time; + + start_time = get_time (); + // could do other work here + end_time = get_time (); + + __analyzer_eval (start_time == end_time); /* { dg-warning "UNKNOWN" } */ + + // Get elapsed time + return end_time - start_time; +} + +asm ("\t.pushsection .text\n" + "\t.globl add_asm\n" + "\t.type add_asm, @function\n" + "add_asm:\n" + "\tmovq %rdi, %rax\n" + "\tadd %rsi, %rax\n" + "\tret\n" + "\t.popsection\n"); + +int test_5 (int count) +{ + asm goto ("dec %0; jb %l[stop]" + : "+r" (count) + : + : + : stop); + return count; +stop: + return 0; +} diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c new file mode 100644 index 0000000..fa50739 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c @@ -0,0 +1,34 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +/* Adapted from Linux x86: page_ref_dec_and_test.c (GPL-2.0). */ + +typedef _Bool bool; + +typedef struct { + int counter; +} atomic_t; + +bool +arch_atomic_dec_and_test(atomic_t *v) { + return ({ + bool c; + asm volatile(".pushsection .smp_locks,\"a\"\n" + ".balign 4\n" + ".long 671f - .\n" + ".popsection\n" + "671:" + "\n\tlock; " + "decl" + " " + "%[var]" + "\n\t/* output condition code " + "e" + "*/\n" + : [ var ] "+m"(v->counter), "=@cc" + "e"(c) + : + : "memory"); + c; + }); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/pr101570.c b/gcc/testsuite/gcc.dg/analyzer/pr101570.c new file mode 100644 index 0000000..809bad6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/pr101570.c @@ -0,0 +1,5 @@ +void +test2 (_Complex double f) +{ + __asm__ ("" : "=r" (__real f)); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c new file mode 100644 index 0000000..6201fdb --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c @@ -0,0 +1,74 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +#include "../analyzer-decls.h" + +/* Copied from linux: arch/x86/include/asm/barrier.h (GPL-2.0) */ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; +} + +/* The analyzer ought to treat array_index_mask_nospec as being + effectively pure. 
*/ + +void test_1 (unsigned long index, unsigned long size) +{ + unsigned long a = array_index_mask_nospec (index, size); + unsigned long b = array_index_mask_nospec (index, size); + __analyzer_eval (a == b); /* { dg-warning "TRUE" } */ +} + +void test_2 (unsigned long index_a, unsigned long size_a, + unsigned long index_b, unsigned long size_b) +{ + unsigned long aa_1 = array_index_mask_nospec (index_a, size_a); + unsigned long ab_1 = array_index_mask_nospec (index_a, size_b); + unsigned long ba_1 = array_index_mask_nospec (index_b, size_a); + unsigned long bb_1 = array_index_mask_nospec (index_b, size_b); + + unsigned long aa_2 = array_index_mask_nospec (index_a, size_a); + unsigned long ab_2 = array_index_mask_nospec (index_a, size_b); + unsigned long ba_2 = array_index_mask_nospec (index_b, size_a); + unsigned long bb_2 = array_index_mask_nospec (index_b, size_b); + + __analyzer_eval (aa_1 == aa_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (ab_1 == ab_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (ba_1 == ba_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (bb_1 == bb_2); /* { dg-warning "TRUE" } */ + + __analyzer_eval (aa_1 == ab_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (aa_1 == ba_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (aa_1 == bb_1); /* { dg-warning "UNKNOWN" } */ + + __analyzer_eval (ab_1 == ba_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (ab_1 == bb_1); /* { dg-warning "UNKNOWN" } */ + + __analyzer_eval (ba_1 == bb_1); /* { dg-warning "UNKNOWN" } */ +} + +/* Equivalent asm strings should be treated the same, rather + than requiring the results to come from the same stmt. */ + +void test_3 (unsigned long index, unsigned long size) +{ + unsigned long a = array_index_mask_nospec (index, size); + unsigned long b; + + /* Copy of the asm from array_index_mask_nospec. */ + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (b) + :"g"(size),"r" (index) + :"cc"); + + __analyzer_eval (a == b); /* { dg-warning "TRUE" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c new file mode 100644 index 0000000..cf5cf97 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c @@ -0,0 +1,81 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). 
*/ + +register unsigned long current_stack_pointer asm("rsp"); + +struct pv_cpu_ops { + /* snip */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx); + /* snip */ +}; +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +/* snip */ +static void cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) { + unsigned long __edi = __edi, __esi = __esi, __edx = __edx, __ecx = __ecx, + __eax = __eax; + asm volatile( + "771:\n\t" + "999:\n\t" + ".pushsection .discard.retpoline_safe\n\t" + " " + ".quad" + " " + " 999b\n\t" + ".popsection\n\t" + "call *%c[paravirt_opptr];" + "\n" + "772:\n" + ".pushsection .parainstructions,\"a\"\n" + " " + ".balign 8" + " " + "\n" + " " + ".quad" + " " + " 771b\n" + " .byte " + "%c[paravirt_typenum]" + "\n" + " .byte 772b-771b\n" + " .short " + "%c[paravirt_clobber]" + "\n" + ".popsection\n" + : "=D"(__edi), "=S"(__esi), "=d"(__edx), "=c"(__ecx), + "+r"(current_stack_pointer) + : [ paravirt_typenum ] "i"( + (__builtin_offsetof(struct paravirt_patch_template, cpu.cpuid) / + sizeof(void *))), + [ paravirt_opptr ] "i"(&(pv_ops.cpu.cpuid)), + [ paravirt_clobber ] "i"(((1 << 9) - 1)), "D"((unsigned long)(eax)), + "S"((unsigned long)(ebx)), "d"((unsigned long)(ecx)), + "c"((unsigned long)(edx)) + : "memory", "cc", "rax", "r8", "r9", "r10", "r11"); +} + +extern void check_init_int(int v); + +void test(unsigned int op) { + unsigned int eax, ebx, ecx, edx; + + eax = op; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + check_init_int(eax); + check_init_int(ebx); /* { dg-bogus "use of uninitialized value 'ebx'" } */ + check_init_int(ecx); + check_init_int(edx); /* { dg-bogus "use of uninitialized value 'edx'" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c new file mode 100644 index 0000000..c4b365f --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c @@ -0,0 +1,135 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). */ + +typedef __SIZE_TYPE__ size_t; + +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) +#define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +#define __ASM_FORM_RAW(x, ...) 
__stringify(x,##__VA_ARGS__) +#define __ASM_SEL(a,b) __ASM_FORM(b) +#define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) +#define _ASM_SP __ASM_REG(sp) + + +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) + +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +/* Adapted from Linux arch/x86/include/asm/paravirt.h */ + +struct pv_cpu_ops { + /* snip */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx); + /* snip */ +}; + +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(pv_ops.op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +#define CLBR_ANY ((1 << 9) - 1) + +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" + +#define PVOP_CALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +/* void functions are still allowed [re]ax for scratch */ +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) + +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" + +#define PVOP_TEST_NULL(op) ((void)pv_ops.op) + +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ + ({ \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ + }) + +#define __PVOP_VCALL(op, ...) 
\ + (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, ##__VA_ARGS__) + +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + +static void cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) +{ + PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); +} + +extern void check_init_int(int v); + +void test(unsigned int op) { + unsigned int eax, ebx, ecx, edx; + + eax = op; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + check_init_int(eax); + check_init_int(ebx); /* { dg-bogus "use of uninitialized value 'ebx'" } */ + check_init_int(ecx); + check_init_int(edx); /* { dg-bogus "use of uninitialized value 'edx'" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c new file mode 100644 index 0000000..243931a --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c @@ -0,0 +1,46 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +#include "../analyzer-decls.h" + +typedef unsigned __INT32_TYPE__ u32; +typedef unsigned __INT64_TYPE__ u64; + +extern void check_init_u32 (u32 v); +extern void check_init_u64 (u32 v); + +/* Adapted from linux kernel: arch/x86/include/asm/processor.h (GPL-2.0). */ + +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + native_cpuid(eax, ebx, ecx, edx); +} + +void test_1 (void) +{ + u32 eax, ebx, ecx, edx; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); /* from "amd_get_topology". */ + + /* Verify that they are now initialized. */ + check_init_u32 (eax); + check_init_u32 (ebx); + check_init_u32 (ecx); + check_init_u32 (edx); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c new file mode 100644 index 0000000..d994787 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c @@ -0,0 +1,210 @@ +/* Adapted from Linux: arch/x86/include/asm/paravirt.h */ + +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). */ + +#include "../analyzer-decls.h" + +typedef unsigned char u8; +typedef unsigned __INT32_TYPE__ u32; +typedef unsigned __INT64_TYPE__ u64; +typedef __SIZE_TYPE__ size_t; + +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +# define __ASM_FORM_RAW(x, ...) 
__stringify(x,##__VA_ARGS__) + +#ifndef __x86_64__ +/* 32 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) +#else +/* 64 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +#endif + +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_SP __ASM_REG(sp) + + +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) + +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +/* Adapted from Linux arch/x86/include/asm/paravirt.h */ + + +/* snip */ + +/* ./arch/x86/include/asm/paravirt.h I think; was: + PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); + +*/ + +#ifndef __x86_64__ +#define CLBR_ANY ((1 << 4) - 1) +#else +#define CLBR_ANY ((1 << 9) - 1) +#endif /* X86_64 */ + +struct pv_cpu_ops { + /* snip */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + /* snip */ +}; + +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(pv_ops.op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" + +#ifndef __x86_64__ + +/* 32-bit. */ + +#define PVOP_CALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS + +#else + +/* 64-bit. 
*/ + +/* [re]ax isn't an arg, but the return val */ +#define PVOP_CALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#define PVOP_TEST_NULL(op) ((void)pv_ops.op) + +#define PVOP_RETVAL(rettype) \ + ({ unsigned long __mask = ~0UL; \ + switch (sizeof(rettype)) { \ + case 1: __mask = 0xffUL; break; \ + case 2: __mask = 0xffffUL; break; \ + case 4: __mask = 0xffffffffUL; break; \ + default: break; \ + } \ + __mask & __eax; \ + }) + +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ + ({ \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ + }) + +#define __PVOP_CALL(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) + +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) +{ + return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); +} + +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ +}) + + +void check_init_int(int); +void check_init_u32(u32); + +void test(void) +{ + int err; + u32 eax, edx; + err = rdmsr_safe(0, &eax, &edx); + check_init_int(err); + check_init_u32(eax); + check_init_u32(edx); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c new file mode 100644 index 0000000..0a1c48f --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c @@ -0,0 +1,33 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +#include "../analyzer-decls.h" + +/* Adapted from Linux: arch/x86/include/asm/msr.h (GPL-2.0) */ + +#ifdef __x86_64__ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) +#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +#define DECLARE_ARGS(val, low, high) unsigned long long val +#define EAX_EDX_VAL(val, low, high) (val) +#define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + +static unsigned long long __rdmsr(unsigned int msr) +{ + DECLARE_ARGS(val, low, high); + + asm volatile("1: rdmsr\n" + "2:\n" + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + return EAX_EDX_VAL(val, low, high); +} + +void test (void) +{ + __analyzer_eval (__rdmsr (0)); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (__rdmsr (1)); /* { dg-warning "UNKNOWN" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c new file mode 100644 index 0000000..e90dccf5 --- /dev/null +++ 
b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c @@ -0,0 +1,319 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-additional-options "-fsanitize=bounds -fno-analyzer-call-summaries" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +/* Reduced from linux kernel: drivers/staging/wfx/sta.c (GPL-2.0) + on x86_64 with "allyesconfig" + + This test is deliberately not fully reduced, as an integration test + that the analyzer doesn't emit bogus "dereference of NULL" warnings + on the repeated wdev_to_wvif calls. */ + +#define NULL ((void *)0) + +/* Types. */ + +typedef unsigned char __u8; +typedef unsigned short __u16; +__extension__ typedef unsigned long long __u64; + +typedef __u8 u8; +typedef __u16 u16; +typedef __u64 u64; + +enum { false = 0, true = 1 }; +typedef _Bool bool; + +struct device; + +typedef struct { + int counter; +} atomic_t; + +struct static_key { + atomic_t enabled; + union { + unsigned long type; + struct jump_entry *entries; + struct static_key_mod *next; + }; +}; + +struct static_key_true { + struct static_key key; +}; + +struct static_key_false { + struct static_key key; +}; + +struct _ddebug { + const char *modname; + const char *function; + const char *filename; + const char *format; + unsigned int lineno : 18; + unsigned int flags : 8; + + union { + struct static_key_true dd_key_true; + struct static_key_false dd_key_false; + } key; + +} __attribute__((aligned(8))); + +enum nl80211_iftype { + /* [...snip...] */ + NL80211_IFTYPE_AP, + /* [...snip...] */ + NUM_NL80211_IFTYPES, + NL80211_IFTYPE_MAX = NUM_NL80211_IFTYPES - 1 +}; + +struct ieee80211_channel { + /* [...snip...] */ + u16 hw_value; + /* [...snip...] */ +}; + +struct cfg80211_chan_def { + struct ieee80211_channel *chan; + /* [...snip...] */ +}; + +struct ieee80211_bss_conf { + /* [...snip...] */ + bool assoc, ibss_joined; + /* [...snip...] */ + struct cfg80211_chan_def chandef; + /* [...snip...] */ + bool ps; + /* [...snip...] */ +}; + +struct ieee80211_conf { + /* [...snip...] */ + int power_level, dynamic_ps_timeout; + /* [...snip...] */ +}; + +struct ieee80211_vif { + enum nl80211_iftype type; + struct ieee80211_bss_conf bss_conf; + /* [...snip...] */ + u8 drv_priv[] __attribute__((__aligned__(sizeof(void *)))); +}; + +struct ieee80211_hw { + struct ieee80211_conf conf; + /* [...snip...] */ +}; + +struct wfx_dev { + /* [...snip...] */ + struct device *dev; + struct ieee80211_hw *hw; + struct ieee80211_vif *vif[2]; + /* [...snip...] */ + int force_ps_timeout; +}; + +struct wfx_vif { + struct wfx_dev *wdev; + struct ieee80211_vif *vif; + /* [...snip...] */ +}; + +/* Function decls. */ + +extern __attribute__((__format__(printf, 1, 2))) void +__warn_printk(const char *fmt, ...); + +extern bool ____wrong_branch_error(void); + +extern __attribute__((__format__(printf, 3, 4))) void +__dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, + const char *fmt, ...); + +bool wfx_api_older_than(struct wfx_dev *wdev, int major, int minor); + +/* Function defns. 
*/ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) { + unsigned long mask; + + asm volatile("cmp %1,%2; sbb %0,%0;" + : "=r"(mask) + : "g"(size), "r"(index) + : "cc"); + return mask; +} + +static inline __attribute__((__always_inline__)) bool +arch_static_branch(struct static_key *key, bool branch) { + asm goto("1:" + "jmp %l[l_yes] # objtool NOPs this \n\t" + ".pushsection __jump_table, \"aw\" \n\t" + " " + ".balign 8" + " " + "\n\t" + ".long 1b - . \n\t" + ".long %l[l_yes] - . \n\t" + " " + ".quad" + " " + "%c0 + %c1 - .\n\t" + ".popsection \n\t" + : + : "i"(key), "i"(2 | branch) + : + : l_yes); + asm(""); + + return false; +l_yes: + return true; +} + +static inline __attribute__((__always_inline__)) bool +arch_static_branch_jump(struct static_key *const key, const bool branch) { + asm goto("1:" + "jmp %l[l_yes]\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + " " + ".balign 8" + " " + "\n\t" + ".long 1b - . \n\t" + ".long %l[l_yes] - . \n\t" + " " + ".quad" + " " + "%c0 + %c1 - .\n\t" + ".popsection \n\t" + : + : "i"(key), "i"(branch) + : + : l_yes); + asm(""); + + return false; +l_yes: + return true; +} + +static inline struct wfx_vif *wdev_to_wvif(struct wfx_dev *wdev, int vif_id) { + if (vif_id >= + (sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))) { + static struct _ddebug __attribute__((__aligned__(8))) + __attribute__((__section__("__dyndbg"))) __UNIQUE_ID_ddebug1678 = { + .modname = "wfx", + .function = __func__, + .filename = "drivers/staging/wfx/wfx.h", + .format = ("requesting non-existent vif: %d\n"), + .lineno = 97, + .flags = 0, + .key.dd_key_false = ((struct static_key_false){ + .key = {.enabled = {0}, {.entries = (void *)0UL}}, + })}; + if (({ + bool branch; + if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1678.key.dd_key_false), + struct static_key_true)) + branch = arch_static_branch_jump( + &(&__UNIQUE_ID_ddebug1678.key.dd_key_false)->key, false); + else if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1678.key.dd_key_false), + struct static_key_false)) + branch = arch_static_branch( + &(&__UNIQUE_ID_ddebug1678.key.dd_key_false)->key, false); + else + branch = ____wrong_branch_error(); + __builtin_expect(!!(branch), 0); + })) + __dynamic_dev_dbg(&__UNIQUE_ID_ddebug1678, wdev->dev, + "requesting non-existent vif: %d\n", vif_id); + return NULL; + } + typeof(vif_id) _i = (vif_id); + typeof((sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))) _s = + ((sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))); + unsigned long _mask = array_index_mask_nospec(_i, _s); + vif_id = (typeof(_i))(_i & _mask); + if (!wdev->vif[vif_id]) { + static struct _ddebug __attribute__((__aligned__(8))) + __attribute__((__section__("__dyndbg"))) __UNIQUE_ID_ddebug1681 = { + .modname = "wfx", + .function = __func__, + .filename = "drivers/staging/wfx/wfx.h", + .format = ("requesting non-allocated vif: %d\n"), + .lineno = 102, + .flags = 0, + .key.dd_key_false = ((struct static_key_false){ + .key = {.enabled = {0}, {.entries = (void *)0UL}}, + })}; + if (({ + bool branch; + if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1681.key.dd_key_false), + struct 
static_key_true)) + branch = arch_static_branch_jump( + &(&__UNIQUE_ID_ddebug1681.key.dd_key_false)->key, false); + else if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1681.key.dd_key_false), + struct static_key_false)) + branch = arch_static_branch( + &(&__UNIQUE_ID_ddebug1681.key.dd_key_false)->key, false); + else + branch = ____wrong_branch_error(); + __builtin_expect(!!(branch), 0); + })) + __dynamic_dev_dbg(&__UNIQUE_ID_ddebug1681, wdev->dev, + "requesting non-allocated vif: %d\n", vif_id); + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; +} + +int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps) { + struct ieee80211_channel *chan0 = NULL, *chan1 = NULL; + struct ieee80211_conf *conf = &wvif->wdev->hw->conf; + + if (wdev_to_wvif(wvif->wdev, 0)) + chan0 = wdev_to_wvif(wvif->wdev, 0)->vif->bss_conf.chandef.chan; /* { dg-bogus "dereference of NULL" } */ + if (wdev_to_wvif(wvif->wdev, 1)) + chan1 = wdev_to_wvif(wvif->wdev, 1)->vif->bss_conf.chandef.chan; /* { dg-bogus "dereference of NULL" } */ + if (chan0 && chan1 && chan0->hw_value != chan1->hw_value && + wvif->vif->type != NL80211_IFTYPE_AP) { + + if (enable_ps) + *enable_ps = true; + if (wvif->wdev->force_ps_timeout > -1) + return wvif->wdev->force_ps_timeout; + else if (wfx_api_older_than(wvif->wdev, 3, 2)) + return 0; + else + return 30; + } + if (enable_ps) + *enable_ps = wvif->vif->bss_conf.ps; + if (wvif->wdev->force_ps_timeout > -1) + return wvif->wdev->force_ps_timeout; + else if (wvif->vif->bss_conf.assoc && wvif->vif->bss_conf.ps) + return conf->dynamic_ps_timeout; + else + return -1; +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c new file mode 100644 index 0000000..a18c58c --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c @@ -0,0 +1,77 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +/* Reproducer for false positive from -Wanalyzer-null-dereference seen + in Linux kernel (drivers/staging/wfx/sta.c; GPL-2.0) due to + the analyzer not grokking that array_index_mask_nospec is + effectively pure, and thus not realizing that array_index_no_spec + is also pure, leading to wdev_to_wvif not being treated as pure, + and thus able to return non-NULL and then NULL. */ + +typedef unsigned char u8; +#define NULL ((void *)0) + +/* Types. 
*/ + +struct ieee80211_vif { + int placeholder; + /* snip */ + u8 drv_priv[]; +}; + +struct wfx_dev { + /* snip */ + struct ieee80211_vif *vif[2]; + /* snip */ +}; + +struct wfx_vif { + struct wfx_dev *wdev; + struct ieee80211_vif *vif; + /* snip */ +}; + +/* Copied from arch/x86/include/asm/barrier.h */ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; +} + +/* Simplified from include/linux/kernel.h */ +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +/* Simplified from include/linux/nospec.h */ + +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + /* snip */ \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Simplified from drivers/staging/wfx/wfx.h */ + +static inline struct wfx_vif *wdev_to_wvif(struct wfx_dev *wdev, int vif_id) { + vif_id = array_index_nospec(vif_id, ARRAY_SIZE(wdev->vif)); + if (!wdev->vif[vif_id]) { + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; +} + +struct ieee80211_vif *test (struct wfx_vif *wvif) { + if (wdev_to_wvif(wvif->wdev, 1)) + return wdev_to_wvif(wvif->wdev, 1)->vif; /* { dg-bogus "dereference of NULL" } */ + else + return NULL; +} diff --git a/gcc/testsuite/gcc.dg/fold-ior-4.c b/gcc/testsuite/gcc.dg/fold-ior-4.c new file mode 100644 index 0000000..8f7213e --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-ior-4.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +unsigned int test_ior(unsigned char i) +{ + return i | (i<<8) | (i<<16) | (i<<24); +} + +unsigned int test_xor(unsigned char i) +{ + return i ^ (i<<8) ^ (i<<16) ^ (i<<24); +} + +unsigned int test_ior_1s(unsigned char i) +{ + return i | (i<<8); +} + +unsigned int test_ior_1u(unsigned char i) +{ + unsigned int t = i; + return t | (t<<8); +} + +unsigned int test_xor_1s(unsigned char i) +{ + return i ^ (i<<8); +} + +unsigned int test_xor_1u(unsigned char i) +{ + unsigned int t = i; + return t ^ (t<<8); +} + +unsigned int test_ior_2s(unsigned char i) +{ + return (i<<8) | (i<<16); +} + +unsigned int test_ior_2u(unsigned char i) +{ + unsigned int t = i; + return (t<<8) | (t<<16); +} + +unsigned int test_xor_2s(unsigned char i) +{ + return (i<<8) ^ (i<<16); +} + +unsigned int test_xor_2u(unsigned char i) +{ + unsigned int t = i; + return (t<<8) ^ (t<<16); +} + +/* { dg-final { scan-tree-dump-not " \\^ " "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \\| " "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \\* 16843009" 2 "optimized" } } */ + diff --git a/gcc/testsuite/gcc.dg/sso-15.c b/gcc/testsuite/gcc.dg/sso-15.c new file mode 100644 index 0000000..d8a711d --- /dev/null +++ b/gcc/testsuite/gcc.dg/sso-15.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define REV_ENDIANNESS __attribute__((scalar_storage_order("big-endian"))) +#else +#define REV_ENDIANNESS __attribute__((scalar_storage_order("little-endian"))) +#endif + +struct X { int *p; } REV_ENDIANNESS; + +struct X x; + +struct X __attribute__((noinline)) foo (int *p) +{ + struct X x; + x.p = p; + return x; +} + +void __attribute((noinline)) bar (void) +{ + *x.p = 1; +} + +extern void abort (void); + +int main (void) +{ + int i = 0; + x = foo(&i); + bar(); + if (i != 1) + abort (); + return 0; 
+} diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c index bbba052..594c3f3 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fno-early-inlining -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fno-early-inlining -fdump-ipa-profile-optimized -fdump-tree-einline-optimized" } */ volatile int one; static int add1 (int val) @@ -22,7 +22,7 @@ int main (void) { int i, val = 0; - for (i = 0; i < 100000; i++) + for (i = 0; i < 10000000; i++) { val = do_op (val, add1); val = do_op (val, sub1); @@ -31,5 +31,5 @@ main (void) } /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "profile"} } */ /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "profile"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "afdo"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "afdo"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining add1/1 into main/4." "einline"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining sub1/2 into main/4." "einline"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c index 138b85a..7020452 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized" } */ static int a1 (void) { diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c new file mode 100644 index 0000000..de7f180 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* SIMD support can emit additional diagnostics. */ +/* { dg-additional-options "-w" } */ + +__attribute__ ((simd)) int +tq (long int ea, int of, int kk) +{ + int bc; + + for (bc = 0; bc < 2; ++bc) + { + ++ea; + of |= !!kk < !!ea; + } + + return of; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c new file mode 100644 index 0000000..4c5226e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c @@ -0,0 +1,19 @@ +/* { dg-options "-O3 -mtune=neoverse-512tvb" } */ + +void +f (float x[restrict 10][1024], + float y[restrict 10][1024], float z) +{ + for (int i = 0; i < 10; ++i) + { +#pragma GCC unroll 10 + for (int j = 0; j < 10; ++j) + x[j][i] = y[j][i] * z; + } +} + +/* We should unroll the outer loop, with 2x 16-byte vectors and 1x + 8-byte vectors. 
*/ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler {\tv[0-9]+\.4s,} } } */ +/* { dg-final { scan-assembler {\tv[0-9]+\.2s,} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c new file mode 100644 index 0000000..43f28d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_ADDL(rettype, intype, ts, rs) \ + rettype test_vaddl_ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = vaddl_ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = vaddl_ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_ADDL (int16x8_t, int8x16_t, s8, s16) +TEST_ADDL (uint16x8_t, uint8x16_t, u8, u16) +TEST_ADDL (int32x4_t, int16x8_t, s16, s32) +TEST_ADDL (uint32x4_t, uint16x8_t, u16, u32) +TEST_ADDL (int64x2_t, int32x4_t, s32, s64) +TEST_ADDL (uint64x2_t, uint32x4_t, u32, u64) + +#define TEST_ADDW(rettype, intype, intypel, ts, rs) \ + rettype test_vaddw_ ## ts (intype a, intype b, intypel c) \ + { \ + rettype t0 = vaddw_ ## ts (a, vget_high_ ## ts (c)); \ + rettype t1 = vaddw_ ## ts (b, vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_ADDW (int16x8_t, int16x8_t, int8x16_t, s8, s16) +TEST_ADDW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) +TEST_ADDW (int32x4_t, int32x4_t, int16x8_t, s16, s32) +TEST_ADDW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) +TEST_ADDW (int64x2_t, int64x2_t, int32x4_t, s32, s64) +TEST_ADDW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c new file mode 100644 index 0000000..c153775 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_MUL_UNIFORM(name, q, vectype, ts) \ + vectype test_ ## name ## q ## _ ## ts (vectype a, vectype b, vectype c) \ + { \ + vectype t0 = name ## q ## _n_ ## ts (a, c[1]); \ + vectype t1 = name ## q ## _n_ ## ts (b, c[1]); \ + return vmul ## q ## _ ## ts (t0, t1); \ + } + +TEST_MUL_UNIFORM (vmul, , int16x4_t, s16) +TEST_MUL_UNIFORM (vmul, , uint16x4_t, u16) +TEST_MUL_UNIFORM (vmul, , int32x2_t, s32) +TEST_MUL_UNIFORM (vmul, , uint32x2_t, u32) +TEST_MUL_UNIFORM (vmul, , float32x2_t, f32) +TEST_MUL_UNIFORM (vmul, q, int16x8_t, s16) +TEST_MUL_UNIFORM (vmul, q, uint16x8_t, u16) +TEST_MUL_UNIFORM (vmul, q, int32x4_t, s32) +TEST_MUL_UNIFORM (vmul, q, uint32x4_t, u32) +TEST_MUL_UNIFORM (vmul, q, float32x4_t, f32) +TEST_MUL_UNIFORM (vmul, q, float64x2_t, f64) + +#define TEST_MLX_UNIFORM(name, q, vectype, ts) \ + vectype test_ ## name ## q ## _ ## ts (vectype acc, vectype a, vectype b) \ + { \ + acc = name ## q ## _n_ ## ts (acc, a, b[1]); \ + return name ## q ## _n_ ## ts (acc, a, b[1]); \ + } + +TEST_MLX_UNIFORM (vmla, , int16x4_t, s16) +TEST_MLX_UNIFORM (vmla, , uint16x4_t, u16) +TEST_MLX_UNIFORM (vmla, , int32x2_t, s32) +TEST_MLX_UNIFORM (vmla, , uint32x2_t, u32) +TEST_MLX_UNIFORM (vmla, , float32x2_t, f32) +TEST_MLX_UNIFORM (vmla, q, int16x8_t, s16) +TEST_MLX_UNIFORM (vmla, q, uint16x8_t, u16) +TEST_MLX_UNIFORM (vmla, q, int32x4_t, s32) +TEST_MLX_UNIFORM (vmla, q, uint32x4_t, u32) +TEST_MLX_UNIFORM (vmla, q, float32x4_t, f32) + 
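/* Editorial aside, not part of the upstream patch: a hedged sketch of the
   codegen these cost-model tests expect.  Taking TEST_MLX_UNIFORM (vmla, q,
   int32x4_t, s32) as an example, vmlaq_n_s32 (acc, a, b[1]) should keep the
   lane operand in its vector register and emit the by-element form, roughly:

       mla	v0.4s, v1.4s, v2.s[1]	// multiply-accumulate by element

   rather than broadcasting the lane first and paying for an extra move:

       dup	v3.4s, v2.s[1]		// broadcast lane 1
       mla	v0.4s, v1.4s, v3.4s

   The scan-assembler-not "dup\t" directive at the end of this file asserts
   that the first form is chosen.  */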
+TEST_MLX_UNIFORM (vmls, , int16x4_t, s16) +TEST_MLX_UNIFORM (vmls, , uint16x4_t, u16) +TEST_MLX_UNIFORM (vmls, , int32x2_t, s32) +TEST_MLX_UNIFORM (vmls, , uint32x2_t, u32) +TEST_MLX_UNIFORM (vmls, , float32x2_t, f32) +TEST_MLX_UNIFORM (vmls, q, int16x8_t, s16) +TEST_MLX_UNIFORM (vmls, q, uint16x8_t, u16) +TEST_MLX_UNIFORM (vmls, q, int32x4_t, s32) +TEST_MLX_UNIFORM (vmls, q, uint32x4_t, u32) +TEST_MLX_UNIFORM (vmls, q, float32x4_t, f32) + +#define TEST_MUL_LONG(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## ts (a, c[1]); \ + rettype t1 = name ## ts (b, c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MUL_LONG (vmull_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MUL_LONG (vmull_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MUL_LONG (vmull_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MUL_LONG (vmull_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MUL_LONG (vqdmull_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MUL_LONG (vqdmull_n_, int64x2_t, int32x2_t, s32, s64) + +#define TEST_MLX_LONG(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## ts (acc, a, b[1]); \ + return name ## ts (acc, a, b[1]); \ + } + +TEST_MLX_LONG (vmlal_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vmlal_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MLX_LONG (vmlal_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MLX_LONG (vmlal_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MLX_LONG (vmlsl_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vmlsl_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MLX_LONG (vmlsl_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MLX_LONG (vmlsl_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MLX_LONG (vqdmlal_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vqdmlal_n_, int64x2_t, int32x2_t, s32, s64) + +TEST_MLX_LONG (vqdmlsl_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vqdmlsl_n_, int64x2_t, int32x2_t, s32, s64) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c new file mode 100644 index 0000000..ecc02e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16) +TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16) +TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MULL_N(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vmull_n, uint32x4_t, 
uint16x8_t, u16, u32) +TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MLXL_VEC(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \ + intype c) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (b)); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + } + +TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16) + +TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16) + +#define TEST_MLXL_N(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + } + +TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32) + +TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c new file mode 100644 index 0000000..09bc7fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_SUBL(rettype, intype, ts, rs) \ + rettype test_vsubl_ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_SUBL (int16x8_t, int8x16_t, s8, s16) +TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16) +TEST_SUBL (int32x4_t, int16x8_t, s16, s32) +TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32) +TEST_SUBL (int64x2_t, int32x4_t, s32, s64) +TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64) + +#define TEST_SUBW(rettype, intype, intypel, ts, rs) \ + rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \ + { \ + rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \ + rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16) +TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) +TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32) +TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) +TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64) +TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ diff --git a/gcc/testsuite/gcc.target/arm/attr-neon.c 
b/gcc/testsuite/gcc.target/arm/attr-neon.c index 225fb8d..e8e3086 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon.c @@ -1,7 +1,10 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-options "-O2 -ftree-vectorize" } */ -/* { dg-add-options arm_neon arm_v8_vfp } */ /* The arm_v8_vfp adds -mfpu=fp-armv8 to the command line, overriding any -mfpu= option set by arm_neon, thus ensuring that the attributes below really are checked for correct fpu selection. */ +/* { dg-add-options arm_neon arm_v8_vfp } */ +/* The arm_v8_vfp adds -mfpu=fp-armv8 to the command line, overriding any + -mfpu= option set by arm_neon, thus ensuring that the attributes below + really are checked for correct fpu selection. */ /* Verify that neon instructions are emitted once. */ void __attribute__ ((target("fpu=neon"))) @@ -18,6 +21,6 @@ f3(int n, int x[], int y[]) { y[i] = x[i] << 3; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ +/* { dg-final { scan-assembler-times "\.fpu\\s+vfp\n" 1 } } */ +/* { dg-final { scan-assembler-times "\.fpu\\s+neon\n" 1 } } */ /* { dg-final { scan-assembler-times "vshl" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/attr-neon2.c b/gcc/testsuite/gcc.target/arm/attr-neon2.c index 2966825..a7a72da 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon2.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon2.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-require-effective-target arm_fp_ok } */ -/* { dg-options "-O2" } */ +/* { dg-options "-Ofast" } */ /* { dg-add-options arm_fp } */ /* Reset fpu to a value compatible with the next pragmas. */ @@ -12,23 +12,36 @@ #include <arm_neon.h> /* Check that pragma target is used. */ -int8x8_t -my (int8x8_t __a, int8x8_t __b) +/* +**my: +** ... +** vadd.f32 d[0-9]+, d[0-9]+, d[0-9]+ +** ... +** bx lr +*/ +float32x2_t +my (float32x2_t __a, float32x2_t __b) { return __a + __b; } #pragma GCC pop_options -/* Check that command line option is restored. */ -int8x8_t -my1 (int8x8_t __a, int8x8_t __b) +/* Check that fpu=vfp is restored. */ +/* +**my1: +** ... +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** ... +** bx lr +*/ +float32x2_t +my1 (float32x2_t __a, float32x2_t __b) { return __a + __b; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ -/* { dg-final { scan-assembler "vadd" } } */ - - +/* { dg-final { scan-assembler "\.fpu\\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+neon\n" } } */ +/* { dg-final { check-function-bodies "**" "" } } */ diff --git a/gcc/testsuite/gcc.target/arm/attr-neon3.c b/gcc/testsuite/gcc.target/arm/attr-neon3.c index 17e429a..0fbce6e 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon3.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon3.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_fp_ok } */ -/* { dg-options "-O2" } */ +/* { dg-options "-Ofast" } */ /* { dg-add-options arm_fp } */ /* Reset fpu to a value compatible with the next pragmas. */ @@ -11,28 +11,54 @@ #include <arm_neon.h> /* Check that neon is used. */ -int8x8_t __attribute__ ((target("fpu=neon"))) -my (int8x8_t __a, int8x8_t __b) +/* +**my: +** ... +** vadd.f32 d[0-9]+, d[0-9]+, d[0-9]+ +** ... 
+** bx lr +*/ +float32x2_t __attribute__ ((target("fpu=neon"))) +my (float32x2_t __a, float32x2_t __b) { return __a + __b; } /* Check that crypto builtins are recognized. */ +/* +**foo: +** ... +** ( +** vld1.64 {d[0-9]+-d[0-9]+}, \[r[0-9]+:64\] +** | +** vld1.64 {d[0-9]+}, \[r[0-9]+:64\]! +** vld1.64 {d[0-9]+}, \[r[0-9]+:64\] +** ) +** ... +** bx lr +*/ + poly128_t __attribute__ ((target("fpu=crypto-neon-fp-armv8"))) foo (poly128_t* ptr) { return vldrq_p128 (ptr); } -/* Check that default mode is restored. */ -int8x8_t -my1 (int8x8_t __a, int8x8_t __b) +/* Check that fpu=vfp is restored. */ +/* +**my1: +** ... +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** ... +** bx lr +*/ +float32x2_t +my1 (float32x2_t __a, float32x2_t __b) { return __a + __b; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu crypto-neon-fp-armv8" 1 } } */ -/* { dg-final { scan-assembler-times "vld1" 1 } } */ -/* { dg-final { scan-assembler-times "vadd" 1} } */ +/* { dg-final { scan-assembler "\.fpu\\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+neon\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+crypto-neon-fp-armv8\n" } } */ +/* { dg-final { check-function-bodies "**" "" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c index 6a92ded..e0fb307 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c @@ -1,12 +1,12 @@ /* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nofp -mthumb -mfloat-abi=hard -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler "\.arch_extension mve" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c index 25e80e9..50645e8 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c @@ -1,12 +1,12 @@ /* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nofp -mthumb -mfloat-abi=softfp -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler "\.arch_extension mve" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c index 38042cc..948f622 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c @@ -1,12 +1,12 @@ /* { dg-do assemble } 
*/ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nomve+nofp -mthumb -mfloat-abi=softfp -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c index 611097e..c5acdb5 100644 --- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c @@ -1,3 +1,4 @@ +/* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_ok } */ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-add-options arm_v8_1m_mve } */ @@ -8,8 +9,6 @@ int8x16_t foo1 (int8x16_t value) { - int8x16_t b = value; + int8x16_t b = -value; return b; } - -/* { dg-final { scan-assembler-not "\.fpu softvfp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c index b8e1051..907db5e 100644 --- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c @@ -1,3 +1,4 @@ +/* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_ok } */ /* { dg-require-effective-target arm_softfp_ok } */ /* { dg-add-options arm_v8_1m_mve } */ @@ -8,8 +9,6 @@ int8x16_t foo1 (int8x16_t value) { - int8x16_t b = value; + int8x16_t b = -value; return b; } - -/* { dg-final { scan-assembler-not "\.fpu softvfp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/pr69245.c b/gcc/testsuite/gcc.target/arm/pr69245.c index bd50518..34b97a2 100644 --- a/gcc/testsuite/gcc.target/arm/pr69245.c +++ b/gcc/testsuite/gcc.target/arm/pr69245.c @@ -23,4 +23,8 @@ void fn2 () d = b * c + a; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ +/* Because we don't know the exact command-line options used to invoke the test + we cannot expect these tests to match exactly once. But they must appear at + least once. 
*/ +/* { dg-final { scan-assembler "\.fpu\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\s+neon-vfpv4\n" } } */ diff --git a/gcc/testsuite/gcc.target/arm/pr98636.c b/gcc/testsuite/gcc.target/arm/pr98636.c index c4d235c..559f9a2 100644 --- a/gcc/testsuite/gcc.target/arm/pr98636.c +++ b/gcc/testsuite/gcc.target/arm/pr98636.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ -/* { dg-options "-mfp16-format=alternative" } */ +/* { dg-require-effective-target arm_softfp_ok } */ +/* { dg-options "-mfp16-format=alternative -mfloat-abi=softfp" } */ #pragma GCC push_options # pragma GCC target ("arch=armv8.2-a+fp16") /* { dg-error "selected fp16 options are incompatible" } */ diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c index 174be85..7e63cf5 100644 --- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c +++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c @@ -22,5 +22,8 @@ uint32_t restored () return bar(); } -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv4} 1 } } */ -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv3-d16} 1 } } */ +/* We can't tell exactly how many times the following tests will match because + command-line options may cause additional instances to be generated, but + each must be present at least once. */ +/* { dg-final { scan-assembler {\.fpu\s+vfpv4\n} } } */ +/* { dg-final { scan-assembler {\.fpu\s+vfpv3-d16\n} } } */ diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c index add40dd..3d33b04 100644 --- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c +++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c @@ -25,5 +25,8 @@ uint32_t restored () return bar(); } -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv4} 1 } } */ -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv3-d16} 1 } } */ +/* We can't tell exactly how many times the following tests will match because + command-line options may cause additional instances to be generated, but + each must be present at least once. 
*/ +/* { dg-final { scan-assembler {\.fpu\s+vfpv4\n} } } */ +/* { dg-final { scan-assembler {\.fpu\s+vfpv3-d16\n} } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c index a31b4a2..9590f25 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */ +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c index 803936e..36dcf73 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */ +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */ #include <immintrin.h> diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c index 490f4af..046804b 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c @@ -1,5 +1,7 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + #define AVX512VL #ifndef CHECK #define CHECK "avx512f-helper.h" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c index 09a87de..56245b1 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c @@ -1,4 +1,6 @@ /* { dg-do run { target { ! 
ia32 } } } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512dq -DTYPE=long" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512dq } */ #include "cond_op_addsubmul_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c index fdcdb34..bdcd2ef 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c @@ -1,5 +1,7 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512bw -DTYPE=short" } */ +/* { dg-require-effective-target avx512bw } */ +/* { dg-require-effective-target avx512vl } */ #define AVX512BW #include "cond_op_addsubmul_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c index 360891f..5ec38df 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ #define AVX512VL #ifndef CHECK diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c index 20ed737..c99c04c 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c @@ -1,4 +1,5 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float" } */ +/* { dg-require-effective-target avx512vl } */ #include "cond_op_addsubmuldiv_double-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c new file mode 100644 index 0000000..8951f4a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_AND" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_XOR" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_IOR" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpxord" 1 } } */ +/* { dg-final { scan-assembler-times "vpord" 1 } } */ +/* { dg-final { scan-assembler-times "vpandd" 1 } } */ + +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE int +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = d[i] OP e[i]; \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (and, &); +BIN (ior, |); +BIN (xor, ^); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c new file mode 100644 index 0000000..23ca412 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c @@ -0,0 +1,78 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_anylogic_d-1.c" +#define BINO2(OPNAME, OP) \ + void 
\ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = d[i] OP e[i]; \ + else \ + j[i] = d[i] - e[i]; \ + } + +BINO2 (and, &); +BINO2 (ior, |); +BINO2 (xor, ^); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_and (); + foo_o2_and (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_xor (); + foo_o2_xor (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo_ior (); + foo_o2_ior (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c new file mode 100644 index 0000000..cb47701 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_AND" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_XOR" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_IOR" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpxorq" 1 } } */ +/* { dg-final { scan-assembler-times "vporq" 1 } } */ +/* { dg-final { scan-assembler-times "vpandq" 1 } } */ + +#include "cond_op_anylogic_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c new file mode 100644 index 0000000..709babf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=int64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_anylogic_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c new file mode 100644 index 0000000..4e14b75 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c @@ -0,0 +1,87 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times ".COND_FMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FMS" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMS" 3 "optimized" } } */ +/* { dg-final { scan-assembler-times "vfmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub231pd\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE double +#endif +#ifndef __BUILTIN_FMA +#define __BUILTIN_FMA __builtin_fma +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) < (Y) ? (Y) : (X)) + +#define FMA3(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo3_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + { \ + TYPE tmp = MAX(d[i], e[i]); \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 tmp); \ + else \ + a[i] = tmp; \ + } \ + } + +#define FMAZ(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + fooz_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + a[i] = .0; \ + } + +#define FMA1(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo1_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + a[i] = d[i]; \ + } + + +FMAZ (fma,, +); +FMAZ (fms,, -); +FMAZ (fnma, -, +); +FMAZ (fnms, -, -); + +FMA1 (fma,, +); +FMA1 (fms,, -); +FMA1 (fnma, -, +); +FMA1 (fnms, -, -); + +FMA3 (fma,, +); +FMA3 (fms,, -); +FMA3 (fnma, -, +); +FMA3 (fnms, -, -); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c new file mode 100644 index 0000000..4c6514e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c @@ -0,0 +1,208 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_fma_double-1.c" +#define FMA3_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo3_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + { \ + TYPE tmp = MAX(d[i], e[i]); \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 tmp); \ + else \ + j[i] = tmp; \ + } \ + } + +#define FMAZ_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + fooz_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + j[i] = .0; \ + } + +#define FMA1_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo1_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + j[i] = d[i]; \ + } + +FMAZ_O2 (fma,, +); +FMAZ_O2 (fms,, -); +FMAZ_O2 (fnma, -, +); +FMAZ_O2 (fnms, -, -); + +FMA1_O2 (fma,, +); +FMA1_O2 (fms,, -); +FMA1_O2 (fnma, -, +); +FMA1_O2 (fnms, -, -); + +FMA3_O2 (fma,, +); 
+FMA3_O2 (fms,, -); +FMA3_O2 (fnma, -, +); +FMA3_O2 (fnms, -, -); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo1_o2_fma (); + /* foo1_fma need to be after foo1_o2_fma since + it changes a[i] which is used by foo1_o2_fma. */ + foo1_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo1_o2_fms (); + foo1_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo1_o2_fnma (); + foo1_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo1_o2_fnms (); + foo1_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fma (); + fooz_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + fooz_o2_fms (); + fooz_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fnma (); + fooz_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fnms (); + fooz_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fma (); + foo3_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo3_o2_fms (); + foo3_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fnma (); + foo3_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fnms (); + foo3_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c new file mode 100644 index 0000000..a5752e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=float -fdump-tree-optimized -D__BUILTIN_FMA=__builtin_fmaf" } */ +/* { dg-final { scan-tree-dump-times ".COND_FMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FMS" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMS" 3 "optimized" } } */ +/* { dg-final { scan-assembler-times "vfmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub231ps\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#include "cond_op_fma_double-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c new file mode 100644 index 0000000..e13d377 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float -D__BUILTIN_FMA=__builtin_fmaf" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_fma_double-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c new file mode 100644 index 0000000..78c6600 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int8 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsb" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsb" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c new file mode 100644 index 0000000..8ba7a3f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=int8" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c new file mode 100644 index 0000000..2543d36 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsd" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsd" 1 } } */ + +typedef char int8; +typedef unsigned char uint8; +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE int +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) < (Y) ? 
(Y) : (X)) + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = OP(d[i], e[i]); \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (max, MAX); +BIN (min, MIN); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c new file mode 100644 index 0000000..f715f54 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c @@ -0,0 +1,67 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_maxmin_d-1.c" +#define BINO2(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = OP(d[i], e[i]); \ + else \ + j[i] = d[i] - e[i]; \ + } + +BINO2 (max, MAX); +BINO2 (min, MIN); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_max (); + foo_o2_max (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_min (); + foo_o2_min (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c new file mode 100644 index 0000000..eda8e19 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vmaxpd" 1 } } */ +/* { dg-final { scan-assembler-times "vminpd" 1 } } */ + +#include<math.h> +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE double +#endif +#ifndef FN_MAX +#define FN_MAX fmax +#endif +#ifndef FN_MIN +#define FN_MIN fmin +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MAX FN_MAX +#define MIN FN_MIN + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("Ofast"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = (OP (d[i], e[i])); \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (max, MAX); +BIN (min, MIN); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c new file mode 100644 index 0000000..c50a831 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c @@ -0,0 +1,67 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -ffast-math" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_maxmin_double-1.c" +#define BINO2(OPNAME, OP) \ + void \ + __attribute__ ((noipa)) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = OP(d[i], e[i]); \ + else \ + j[i] = d[i] - e[i]; \ + } + +BINO2 (max, MAX); +BINO2 (min, MIN); + +static void 
+test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_max (); + foo_o2_max (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_min (); + foo_o2_min (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c new file mode 100644 index 0000000..2d2157d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=float -fdump-tree-optimized -DFN_MAX=fmaxf -DFN_MIN=fminf" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vmaxps" 1 } } */ +/* { dg-final { scan-assembler-times "vminps" 1 } } */ + +#include "cond_op_maxmin_double-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c new file mode 100644 index 0000000..fec784e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float -DFN_MAX=fmaxf -DFN_MIN=fminf" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_double-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c new file mode 100644 index 0000000..a1925c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsq" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsq" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c new file mode 100644 index 0000000..205a65a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=int64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c new file mode 100644 index 0000000..117179f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint8 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxub" 1 } } */ +/* { dg-final { scan-assembler-times "vpminub" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c new file mode 100644 index 0000000..ac4a206 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=uint8" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c new file mode 100644 index 0000000..1ce0f82 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint32 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxud" 1 } } */ +/* { dg-final { scan-assembler-times "vpminud" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c new file mode 100644 index 0000000..d609ef0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint32" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c new file mode 100644 index 0000000..82209f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxuq" 1 } } */ +/* { dg-final { scan-assembler-times "vpminuq" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c new file mode 100644 index 0000000..c2053c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c new file mode 100644 index 0000000..43d560d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint16 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxuw" 1 } } */ +/* { dg-final { scan-assembler-times "vpminuw" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c new file mode 100644 index 0000000..463fc52 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=uint16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + 
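+/* A sketch, not part of the committed test: each of these -2.c runtime
+   wrappers re-instantiates the shared template with a different -DTYPE
+   and checks the O3-vectorized kernel against an O2-compiled reference.
+   Specialized by hand for uint16 (the type this wrapper selects), the
+   kernel under test is roughly
+
+     uint16 a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM];
+
+     void
+     foo_max (void)
+     {
+       for (int i = 0; i != NUM; i++)
+         if (b[i] < c[i])
+           a[i] = MAX (d[i], e[i]);
+         else
+           a[i] = d[i] - e[i];
+     }
+
+   where the b[i] < c[i] guard becomes the mask of a .COND_MAX internal
+   function and the subtraction supplies the value for unselected lanes;
+   MAX and NUM are as defined in cond_op_maxmin_d-1.c above.  */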
+#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c new file mode 100644 index 0000000..d4d388e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int16 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsw" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsw" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c new file mode 100644 index 0000000..d6e45e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=int16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/eh_return-2.c b/gcc/testsuite/gcc.target/i386/eh_return-2.c new file mode 100644 index 0000000..f23f449 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/eh_return-2.c @@ -0,0 +1,16 @@ +/* PR target/101772 */ +/* { dg-do compile } */ +/* { dg-additional-options "-O0 -march=x86-64 -mstackrealign" } */ + +struct _Unwind_Context _Unwind_Resume_or_Rethrow_this_context; + +void offset (int); + +struct _Unwind_Context { + void *reg[7]; +} _Unwind_Resume_or_Rethrow() { + struct _Unwind_Context cur_contextcur_context = + _Unwind_Resume_or_Rethrow_this_context; + offset(0); + __builtin_eh_return ((long) offset, 0); +} diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c index e5616d8..77ace86 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c @@ -5,3 +5,4 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-not "vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-11b.c b/gcc/testsuite/gcc.target/i386/pr100865-11b.c index 12d55b9..7e458e8 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-11b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-11b.c @@ -5,4 +5,4 @@ /* { dg-final { scan-assembler-times "movabsq" 1 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-12b.c b/gcc/testsuite/gcc.target/i386/pr100865-12b.c index 63a5629..dee0cfb 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-12b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-12b.c @@ -5,4 +5,4 @@ /* { dg-final { scan-assembler-times "movabsq" 1 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index 
8e8a7ea..80e9fdb 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -5,5 +5,7 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c index 44e74c6..35f2e96 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c @@ -4,6 +4,9 @@ #include "pr100865-6a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c index 0a68820..ad267c4 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c @@ -5,5 +5,8 @@ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8b.c b/gcc/testsuite/gcc.target/i386/pr100865-8b.c index 99a10ad..4b7dd7c 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-8b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-8b.c @@ -4,4 +4,4 @@ #include "pr100865-8a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9b.c b/gcc/testsuite/gcc.target/i386/pr100865-9b.c index 1469624..a315dde 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-9b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-9b.c @@ -4,4 +4,4 @@ #include "pr100865-9a.c" /* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c new file mode 100644 index 0000000..67ea405 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ + +int n2; + +__attribute__ ((simd)) char +w7 (void) +{ + short int xb = n2; + int qp; + + for (qp = 0; qp < 2; ++qp) + xb = xb < 1; + + return xb; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c new file mode 100644 index 0000000..ba19064 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ + +#include "pr101742a.c" diff --git a/gcc/testsuite/gcc.target/i386/pr82941-1.c b/gcc/testsuite/gcc.target/i386/pr82941-1.c index d7e530d..c3be2f5 100644 --- a/gcc/testsuite/gcc.target/i386/pr82941-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82941-1.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82942-1.c b/gcc/testsuite/gcc.target/i386/pr82942-1.c index 9cdf81a..29ead04 100644 --- a/gcc/testsuite/gcc.target/i386/pr82942-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82942-1.c @@ -3,4 +3,5 @@ #include "pr82941-1.c" -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-1.c b/gcc/testsuite/gcc.target/i386/pr82990-1.c index ff1d6d4..bbf580f 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-1.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-3.c b/gcc/testsuite/gcc.target/i386/pr82990-3.c index 201fa98..89ddb20 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-3.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-3.c @@ -3,4 +3,5 @@ #include "pr82941-1.c" -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-5.c b/gcc/testsuite/gcc.target/i386/pr82990-5.c index 9932bdc..b9da0e7 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-5.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-5.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-3.c b/gcc/testsuite/gcc.target/i386/pr99744-3.c new file mode 100644 index 0000000..6c50581 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-serialize" } */ + +#include <x86intrin.h> + +__attribute__ ((target("general-regs-only"))) +void +foo1 (void) +{ + _serialize (); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-4.c b/gcc/testsuite/gcc.target/i386/pr99744-4.c new file mode 100644 index 0000000..9196e62 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-4.c @@ -0,0 +1,357 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbmi -mbmi2 -mcldemote -mclflushopt -mclwb -mclzero -mcrc32 -menqcmd -mfsgsbase -mfxsr -mhreset -mlzcnt -mlwp -mmovdir64b -mmovdiri -mmwaitx -mpconfig -mpku -mpopcnt -mptwrite -mrdpid -mrdrnd -mrdseed -mrtm -msgx -mshstk -mtbm -mtsxldtrk -mxsave -mxsavec -mxsaveopt -mxsaves -mwaitpkg -mwbnoinvd" } */ +/* { dg-additional-options "-muintr" { target { ! ia32 } } } */ + +/* Test calling GPR intrinsics from functions with general-regs-only + target attribute. 
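+ Each wrapper below must compile even though only general-purpose
+ registers are available in such functions.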
*/ + +#include <x86gprintrin.h> + +#define _CONCAT(x,y) x ## y + +#define test_0(func, type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (void) \ + { return func (); } + +#define test_0_i1(func, type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (void) \ + { return func (imm); } + +#define test_1(func, type, op1_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A) \ + { return func (A); } + +#define test_1_i1(func, type, op1_type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A) \ + { return func (A, imm); } + +#define test_2(func, type, op1_type, op2_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B); } + +#define test_2_i1(func, type, op1_type, op2_type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B, imm); } + +#define test_3(func, type, op1_type, op2_type, op3_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C) \ + { return func (A, B, C); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C, \ + op4_type D) \ + { return func (A, B, C, D); } + +/* ia32intrin.h */ +test_1 (__bsfd, int, int) +test_1 (__bsrd, int, int) +test_1 (__bswapd, int, int) +test_1 (__popcntd, int, unsigned int) +test_2 (__rolb, unsigned char, unsigned char, int) +test_2 (__rolw, unsigned short, unsigned short, int) +test_2 (__rold, unsigned int, unsigned int, int) +test_2 (__rorb, unsigned char, unsigned char, int) +test_2 (__rorw, unsigned short, unsigned short, int) +test_2 (__rord, unsigned int, unsigned int, int) + +#ifndef __iamcu__ +/* adxintrin.h */ +test_4 (_subborrow_u32, unsigned char, unsigned char, unsigned int, + unsigned int, unsigned int *) +test_4 (_addcarry_u32, unsigned char, unsigned char, unsigned int, + unsigned int, unsigned int *) +test_4 (_addcarryx_u32, unsigned char, unsigned char, unsigned int, + unsigned int, unsigned int *) + +/* bmiintrin.h */ +test_1 (__tzcnt_u16, unsigned short, unsigned short) +test_2 (__andn_u32, unsigned int, unsigned int, unsigned int) +test_2 (__bextr_u32, unsigned int, unsigned int, unsigned int) +test_3 (_bextr_u32, unsigned int, unsigned int, unsigned int, + unsigned int) +test_1 (__blsi_u32, unsigned int, unsigned int) +test_1 (_blsi_u32, unsigned int, unsigned int) +test_1 (__blsmsk_u32, unsigned int, unsigned int) +test_1 (_blsmsk_u32, unsigned int, unsigned int) +test_1 (__blsr_u32, unsigned int, unsigned int) +test_1 (_blsr_u32, unsigned int, unsigned int) +test_1 (__tzcnt_u32, unsigned int, unsigned int) +test_1 (_tzcnt_u32, unsigned int, unsigned int) + +/* bmi2intrin.h */ +test_2 (_bzhi_u32, unsigned int, unsigned int, unsigned int) +test_2 (_pdep_u32, unsigned int, unsigned int, unsigned int) +test_2 (_pext_u32, unsigned int, unsigned int, unsigned int) + +/* cetintrin.h */ +test_1 (_inc_ssp, void, unsigned int) +test_0 (_saveprevssp, void) +test_1 (_rstorssp, void, void *) +test_2 (_wrssd, void, unsigned int, void *) +test_2 (_wrussd, void, unsigned int, void *) +test_0 (_setssbsy, void) +test_1 (_clrssbsy, void, void *) + +/* cldemoteintrin.h */ +test_1 (_cldemote, void, void *) + +/* clflushoptintrin.h */ +test_1 (_mm_clflushopt, 
void, void *) + +/* clwbintrin.h */ +test_1 (_mm_clwb, void, void *) + +/* clzerointrin.h */ +test_1 (_mm_clzero, void, void *) + +/* enqcmdintrin.h */ +test_2 (_enqcmd, int, void *, const void *) +test_2 (_enqcmds, int, void *, const void *) + +/* fxsrintrin.h */ +test_1 (_fxsave, void, void *) +test_1 (_fxrstor, void, void *) + +/* hresetintrin.h */ +test_1 (_hreset, void, unsigned int) + +/* ia32intrin.h */ +test_2 (__crc32b, unsigned int, unsigned char, unsigned char) +test_2 (__crc32w, unsigned int, unsigned short, unsigned short) +test_2 (__crc32d, unsigned int, unsigned int, unsigned int) +test_1 (__rdpmc, unsigned long long, int) +test_0 (__rdtsc, unsigned long long) +test_1 (__rdtscp, unsigned long long, unsigned int *) +test_0 (__pause, void) + +/* lzcntintrin.h */ +test_1 (__lzcnt16, unsigned short, unsigned short) +test_1 (__lzcnt32, unsigned int, unsigned int) +test_1 (_lzcnt_u32, unsigned int, unsigned int) + +/* lwpintrin.h */ +test_1 (__llwpcb, void, void *) +test_0 (__slwpcb, void *) +test_2_i1 (__lwpval32, void, unsigned int, unsigned int, 1) +test_2_i1 (__lwpins32, unsigned char, unsigned int, unsigned int, 1) + +/* movdirintrin.h */ +test_2 (_directstoreu_u32, void, void *, unsigned int) +test_2 (_movdir64b, void, void *, const void *) + +/* mwaitxintrin.h */ +test_3 (_mm_monitorx, void, void const *, unsigned int, unsigned int) +test_3 (_mm_mwaitx, void, unsigned int, unsigned int, unsigned int) + +/* pconfigintrin.h */ +test_2 (_pconfig_u32, unsigned int, const unsigned int, size_t *) + +/* pkuintrin.h */ +test_0 (_rdpkru_u32, unsigned int) +test_1 (_wrpkru, void, unsigned int) + +/* popcntintrin.h */ +test_1 (_mm_popcnt_u32, int, unsigned int) + +/* rdseedintrin.h */ +test_1 (_rdseed16_step, int, unsigned short *) +test_1 (_rdseed32_step, int, unsigned int *) + +/* rtmintrin.h */ +test_0 (_xbegin, unsigned int) +test_0 (_xend, void) +test_0_i1 (_xabort, void, 1) + +/* sgxintrin.h */ +test_2 (_encls_u32, unsigned int, const unsigned int, size_t *) +test_2 (_enclu_u32, unsigned int, const unsigned int, size_t *) +test_2 (_enclv_u32, unsigned int, const unsigned int, size_t *) + +/* tbmintrin.h */ +test_1_i1 (__bextri_u32, unsigned int, unsigned int, 1) +test_1 (__blcfill_u32, unsigned int, unsigned int) +test_1 (__blci_u32, unsigned int, unsigned int) +test_1 (__blcic_u32, unsigned int, unsigned int) +test_1 (__blcmsk_u32, unsigned int, unsigned int) +test_1 (__blcs_u32, unsigned int, unsigned int) +test_1 (__blsfill_u32, unsigned int, unsigned int) +test_1 (__blsic_u32, unsigned int, unsigned int) +test_1 (__t1mskc_u32, unsigned int, unsigned int) +test_1 (__tzmsk_u32, unsigned int, unsigned int) + +/* tsxldtrkintrin.h */ +test_0 (_xsusldtrk, void) +test_0 (_xresldtrk, void) + +/* x86gprintrin.h */ +test_1 (_ptwrite32, void, unsigned int) +test_1 (_rdrand16_step, int, unsigned short *) +test_1 (_rdrand32_step, int, unsigned int *) +test_0 (_wbinvd, void) + +/* xtestintrin.h */ +test_0 (_xtest, int) + +/* xsaveintrin.h */ +test_2 (_xsave, void, void *, long long) +test_2 (_xrstor, void, void *, long long) +test_2 (_xsetbv, void, unsigned int, long long) +test_1 (_xgetbv, long long, unsigned int) + +/* xsavecintrin.h */ +test_2 (_xsavec, void, void *, long long) + +/* xsaveoptintrin.h */ +test_2 (_xsaveopt, void, void *, long long) + +/* xsavesintrin.h */ +test_2 (_xsaves, void, void *, long long) +test_2 (_xrstors, void, void *, long long) + +/* wbnoinvdintrin.h */ +test_0 (_wbnoinvd, void) + +#ifdef __x86_64__ +/* adxintrin.h */ +test_4 (_subborrow_u64, unsigned char, 
unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) +test_4 (_addcarry_u64, unsigned char, unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) +test_4 (_addcarryx_u64, unsigned char, unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) + +/* bmiintrin.h */ +test_2 (__andn_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (__bextr_u64, unsigned long long, unsigned long long, + unsigned long long) +test_3 (_bextr_u64, unsigned long long, unsigned long long, + unsigned long long, unsigned long long) +test_1 (__blsi_u64, unsigned long long, unsigned long long) +test_1 (_blsi_u64, unsigned long long, unsigned long long) +test_1 (__blsmsk_u64, unsigned long long, unsigned long long) +test_1 (_blsmsk_u64, unsigned long long, unsigned long long) +test_1 (__blsr_u64, unsigned long long, unsigned long long) +test_1 (_blsr_u64, unsigned long long, unsigned long long) +test_1 (__tzcnt_u64, unsigned long long, unsigned long long) +test_1 (_tzcnt_u64, unsigned long long, unsigned long long) + +/* bmi2intrin.h */ +test_2 (_bzhi_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (_pdep_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (_pext_u64, unsigned long long, unsigned long long, + unsigned long long) +test_3 (_mulx_u64, unsigned long long, unsigned long long, + unsigned long long, unsigned long long *) + +/* cetintrin.h */ +test_0 (_get_ssp, unsigned long long) +test_2 (_wrssq, void, unsigned long long, void *) +test_2 (_wrussq, void, unsigned long long, void *) + +/* fxsrintrin.h */ +test_1 (_fxsave64, void, void *) +test_1 (_fxrstor64, void, void *) + +/* ia32intrin.h */ +test_1 (__bsfq, int, long long) +test_1 (__bsrq, int, long long) +test_1 (__bswapq, long long, long long) +test_2 (__crc32q, unsigned long long, unsigned long long, + unsigned long long) +test_1 (__popcntq, long long, unsigned long long) +test_2 (__rolq, unsigned long long, unsigned long long, int) +test_2 (__rorq, unsigned long long, unsigned long long, int) +test_0 (__readeflags, unsigned long long) +test_1 (__writeeflags, void, unsigned int) + +/* lzcntintrin.h */ +test_1 (__lzcnt64, unsigned long long, unsigned long long) +test_1 (_lzcnt_u64, unsigned long long, unsigned long long) + +/* lwpintrin.h */ +test_2_i1 (__lwpval64, void, unsigned long long, unsigned int, 1) +test_2_i1 (__lwpins64, unsigned char, unsigned long long, + unsigned int, 1) + +/* movdirintrin.h */ +test_2 (_directstoreu_u64, void, void *, unsigned long long) + +/* popcntintrin.h */ +test_1 (_mm_popcnt_u64, long long, unsigned long long) + +/* rdseedintrin.h */ +test_1 (_rdseed64_step, int, unsigned long long *) + +/* tbmintrin.h */ +test_1_i1 (__bextri_u64, unsigned long long, unsigned long long, 1) +test_1 (__blcfill_u64, unsigned long long, unsigned long long) +test_1 (__blci_u64, unsigned long long, unsigned long long) +test_1 (__blcic_u64, unsigned long long, unsigned long long) +test_1 (__blcmsk_u64, unsigned long long, unsigned long long) +test_1 (__blcs_u64, unsigned long long, unsigned long long) +test_1 (__blsfill_u64, unsigned long long, unsigned long long) +test_1 (__blsic_u64, unsigned long long, unsigned long long) +test_1 (__t1mskc_u64, unsigned long long, unsigned long long) +test_1 (__tzmsk_u64, unsigned long long, unsigned long long) + +/* uintrintrin.h */ +test_0 (_clui, void) +test_1 (_senduipi, void, unsigned long long) +test_0 (_stui, void) +test_0 (_testui, unsigned char) + 
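+/* Hand-expanded for illustration, not part of the test: each test_N
+   invocation stamps out one general-regs-only wrapper, e.g. the line
+
+     test_1 (_rdrand64_step, int, unsigned long long *)
+
+   below expands to
+
+     __attribute__ ((target("general-regs-only")))
+     int do__rdrand64_step (unsigned long long *A)
+     { return _rdrand64_step (A); }  */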
+/* x86gprintrin.h */ +test_1 (_ptwrite64, void, unsigned long long) +test_0 (_readfsbase_u32, unsigned int) +test_0 (_readfsbase_u64, unsigned long long) +test_0 (_readgsbase_u32, unsigned int) +test_0 (_readgsbase_u64, unsigned long long) +test_1 (_rdrand64_step, int, unsigned long long *) +test_1 (_writefsbase_u32, void, unsigned int) +test_1 (_writefsbase_u64, void, unsigned long long) +test_1 (_writegsbase_u32, void, unsigned int) +test_1 (_writegsbase_u64, void, unsigned long long) + +/* xsaveintrin.h */ +test_2 (_xsave64, void, void *, long long) +test_2 (_xrstor64, void, void *, long long) + +/* xsavecintrin.h */ +test_2 (_xsavec64, void, void *, long long) + +/* xsaveoptintrin.h */ +test_2 (_xsaveopt64, void, void *, long long) + +/* xsavesintrin.h */ +test_2 (_xsaves64, void, void *, long long) +test_2 (_xrstors64, void, void *, long long) + +/* waitpkgintrin.h */ +test_1 (_umonitor, void, void *) +test_2 (_umwait, unsigned char, unsigned int, unsigned long long) +test_2 (_tpause, unsigned char, unsigned int, unsigned long long) + +#else /* !__x86_64__ */ +/* bmi2intrin.h */ +test_3 (_mulx_u32, unsigned int, unsigned int, unsigned int, + unsigned int *) + +/* cetintrin.h */ +test_0 (_get_ssp, unsigned int) +#endif /* __x86_64__ */ + +#endif diff --git a/gcc/testsuite/gcc.target/i386/pr99744-5.c b/gcc/testsuite/gcc.target/i386/pr99744-5.c new file mode 100644 index 0000000..9e40e5e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-5.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mmwait" } */ + +/* Test calling MWAIT intrinsics from functions with general-regs-only + target attribute. */ + +#include <x86gprintrin.h> + +#define _CONCAT(x,y) x ## y + +#define test_2(func, type, op1_type, op2_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B); } + +#define test_3(func, type, op1_type, op2_type, op3_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C) \ + { return func (A, B, C); } + +#ifndef __iamcu__ +/* mwaitintrin.h */ +test_3 (_mm_monitor, void, void const *, unsigned int, unsigned int) +test_2 (_mm_mwait, void, unsigned int, unsigned int) +#endif diff --git a/gcc/testsuite/gcc.target/i386/pr99744-6.c b/gcc/testsuite/gcc.target/i386/pr99744-6.c new file mode 100644 index 0000000..4025918 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-6.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +#include <x86intrin.h> + +extern unsigned long long int curr_deadline; +extern void bar (void); + +void +foo1 (void) +{ + if (__rdtsc () < curr_deadline) + return; + bar (); +} + +void +foo2 (unsigned int *p) +{ + if (__rdtscp (p) < curr_deadline) + return; + bar (); +} diff --git a/gcc/testsuite/gcc.target/i386/pr99744-7.c b/gcc/testsuite/gcc.target/i386/pr99744-7.c new file mode 100644 index 0000000..30b7ca0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-7.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mno-avx -Wno-psabi" } */ + +#include <x86intrin.h> + +void +foo (__m256 *x) +{ + x[0] = _mm256_sub_ps (x[1], x[2]); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-8.c b/gcc/testsuite/gcc.target/i386/pr99744-8.c new file mode 100644 index 0000000..115183e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-8.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O 
-Wno-psabi" } */ + +#include <x86intrin.h> + +__attribute__((target ("no-avx"))) +void +foo (__m256 *x) +{ + x[0] = _mm256_sub_ps (x[1], x[2]); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-gather-1.c b/gcc/testsuite/gcc.target/i386/vect-gather-1.c new file mode 100644 index 0000000..261b66b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-gather-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msse2 -fdump-tree-vect-details -fdump-tree-forwprop4" } */ + +#ifndef INDEXTYPE +#define INDEXTYPE int +#endif +double vmul(INDEXTYPE *rowstart, INDEXTYPE *rowend, + double *luval, double *dst) +{ + double res = 0; + for (const INDEXTYPE * col = rowstart; col != rowend; ++col, ++luval) + res += *luval * dst[*col]; + return res; +} + +/* With gather emulation this should be profitable to vectorize + even with plain SSE2. */ +/* { dg-final { scan-tree-dump "loop vectorized" "vect" } } */ +/* The index vector loads and promotions should be scalar after forwprop. */ +/* { dg-final { scan-tree-dump-not "vec_unpack" "forwprop4" } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c new file mode 100644 index 0000000..146df24 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c @@ -0,0 +1,68 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mvsx -Wno-psabi" } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ + +#define NO_WARN_X86_INTRINSICS 1 +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include <smmintrin.h> + +#define DIM(a) (sizeof (a) / sizeof (a)[0]) + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned short s[8]; + } src[] = + { + { .s = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } }, + { .s = { 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } }, + { .s = { 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000, 0xffff } }, + { .s = { 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008 } }, + { .s = { 0x0008, 0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001 } }, + { .s = { 0xfff4, 0xfff3, 0xfff2, 0xfff1, 0xfff3, 0xfff1, 0xfff2, 0xfff3 } } + }; + unsigned short minVal[DIM (src)]; + int minInd[DIM (src)]; + unsigned short minValScalar, minIndScalar; + int i, j; + union + { + int si; + unsigned short s[2]; + } res; + + for (i = 0; i < DIM (src); i++) + { + res.si = _mm_cvtsi128_si32 (_mm_minpos_epu16 (src[i].x)); + minVal[i] = res.s[0]; + minInd[i] = res.s[1] & 0b111; + } + + for (i = 0; i < DIM (src); i++) + { + minValScalar = src[i].s[0]; + minIndScalar = 0; + + for (j = 1; j < 8; j++) + if (minValScalar > src[i].s[j]) + { + minValScalar = src[i].s[j]; + minIndScalar = j; + } + + if (minValScalar != minVal[i] && minIndScalar != minInd[i]) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c index 2dcaf08..a89dd46 100644 --- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c @@ -16,13 +16,13 @@ sqxbr (long double x) return out; } -/* Ideally `vpdi %v3,%v1,%v3,5` should be optimized away, but the compiler +/* Ideally `vmrlg %v3,%v1,%v3` should be optimized away, but the compiler * can't do it, because the UNSPEC 
pattern operates on the whole register. * Using the SUBREG pattern solves this problem, but it's fragile. */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v2,%v0,%v2,5\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v1,%v1,%v3,0\n} 2 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v3,%v1,%v3,5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v2,%v0,%v2\n} 1 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v1,%v1,%v3\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v3,%v1,%v3\n} 1 } } */ int main (void) diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c index 6c5f88d..dd894c8 100644 --- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c @@ -15,12 +15,12 @@ sqxbr (long double x) return inout; } -/* Ideally there should be just one `vpdi %v6,%v4,%v6,5`, but the compiler +/* Ideally there should be just one `vmrlg %v6,%v4,%v6`, but the compiler * can't optimize it away, because the UNSPEC pattern operates on the whole * register. Using the SUBREG pattern solves this problem, but it's fragile. */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v6,%v4,%v6,5\n} 2 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v4,%v4,%v6,0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v6,%v4,%v6\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v4,%v4,%v6\n} 2 } } */ int main (void) diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-merge.c b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c new file mode 100644 index 0000000..51b23dd --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c @@ -0,0 +1,104 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ + +/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ + +#include "vec-types.h" + +#define GEN_MERGE_2(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A] }; } + +#define GEN_MERGE_4(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A] }; } + +#define GEN_MERGE_8(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A] }; } + +#define GEN_MERGE_16(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A], \ + a[4+A], b[4+A], a[5+A], b[5+A], a[6+A], b[6+A], a[7+A], b[7+A]}; } + + +GEN_MERGE_16(v16qi, l, 8) +GEN_MERGE_16(v16qi, h, 0) +GEN_MERGE_16(uv16qi, l, 8) +GEN_MERGE_16(uv16qi, h, 0) + +GEN_MERGE_8(v8hi, l, 4) +GEN_MERGE_8(v8hi, h, 0) 
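+/* Hand-expanded for illustration, not part of the test: the v8hi pair
+   above becomes
+
+     v8hi merge_l_v8hi (v8hi a, v8hi b)
+     { return (v8hi){ a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7] }; }
+
+   plus the corresponding h variant with offset 0, i.e. an interleave of
+   the low respectively high halves of the operands, which the backend
+   now matches to single vmrlh/vmrhh instructions instead of a general
+   permute.  */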
+GEN_MERGE_8(uv8hi, l, 4) +GEN_MERGE_8(uv8hi, h, 0) + +GEN_MERGE_4(v4si, l, 2) +GEN_MERGE_4(v4si, h, 0) +GEN_MERGE_4(uv4si, l, 2) +GEN_MERGE_4(uv4si, h, 0) + +GEN_MERGE_4(v4sf, l, 2) +GEN_MERGE_4(v4sf, h, 0) + +GEN_MERGE_2(v2di, l, 1) +GEN_MERGE_2(v2di, h, 0) +GEN_MERGE_2(uv2di, l, 1) +GEN_MERGE_2(uv2di, h, 0) + +GEN_MERGE_2(v2df, l, 1) +GEN_MERGE_2(v2df, h, 0) + + +#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != (i + elts) / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != i / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \ + CHECK_MERGE_LO (VEC_TYPE, a, b); \ + CHECK_MERGE_HI (VEC_TYPE, a, b); \ + } + +int +main () +{ + CHECK_MERGE(v16qi); + CHECK_MERGE(uv16qi); + CHECK_MERGE(v8hi); + CHECK_MERGE(uv8hi); + CHECK_MERGE(v4si); + CHECK_MERGE(uv4si); + CHECK_MERGE(v4sf); + CHECK_MERGE(v2di); + CHECK_MERGE(uv2di); + CHECK_MERGE(v2df); +} diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c new file mode 100644 index 0000000..cc92531 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c @@ -0,0 +1,49 @@ +/* { dg-do run { target { s390*-*-* } } } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ + +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvpdi\t" 6 } } */ + +#include "vec-types.h" +#include <vecintrin.h> + +#define GEN_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE __attribute__((noinline)) \ + permi_##BITS##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){a[((BITS) & 2) >> 1], b[(BITS) & 1] }; } + +#define GEN_PERMI(VEC_TYPE) \ + GEN_PERMI_BITS(VEC_TYPE, 0); \ + GEN_PERMI_BITS(VEC_TYPE, 1); \ + GEN_PERMI_BITS(VEC_TYPE, 2); \ + GEN_PERMI_BITS(VEC_TYPE, 3); \ + +GEN_PERMI(v2di) +GEN_PERMI(uv2di) +GEN_PERMI(v2df) + + +#define CHECK_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE r##BITS = permi_##BITS##_##VEC_TYPE (a, b); \ + if (r##BITS[0] != ((BITS) & 2) >> 1 \ + || r##BITS[1] != ((BITS) & 1) + 2) \ + __builtin_abort(); + +#define CHECK_PERMI(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 0); \ + CHECK_PERMI_BITS (VEC_TYPE, 1); \ + CHECK_PERMI_BITS (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 3); \ + } + +int +main () +{ + CHECK_PERMI (v2di); + CHECK_PERMI (uv2di); + CHECK_PERMI (v2df); +} diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-types.h b/gcc/testsuite/gcc.target/s390/vector/vec-types.h new file mode 100644 index 0000000..b7ffbe7 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-types.h @@ -0,0 +1,35 @@ +#ifndef VEC_TYPES_H +#define VEC_TYPES_H 1 + +typedef __attribute__((vector_size(16))) signed char v16qi; +typedef __attribute__((vector_size(16))) unsigned char uv16qi; + +typedef __attribute__((vector_size(16))) signed short v8hi; +typedef __attribute__((vector_size(16))) unsigned short uv8hi; + +typedef __attribute__((vector_size(16))) signed int v4si; +typedef 
__attribute__((vector_size(16))) unsigned int uv4si; + +typedef __attribute__((vector_size(16))) signed long long v2di; +typedef __attribute__((vector_size(16))) unsigned long long uv2di; + +#if __SIZEOF_INT128__ == 16 +typedef __attribute__((vector_size(16))) __int128_t v1ti; +#endif + +typedef __attribute__((vector_size(16))) double v2df; +typedef __attribute__((vector_size(16))) long double v1tf; + +#if __ARCH__ >= 12 +typedef __attribute__((vector_size(16))) float v4sf; +#endif + +#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \ + ({ VEC_TYPE dummy; \ + const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \ + typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \ + for (int i = 0; i < elts; i++) \ + ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \ + *(VEC_TYPE*)ar;}) + +#endif diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c b/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c deleted file mode 100644 index c0a852b..0000000 --- a/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c +++ /dev/null @@ -1,54 +0,0 @@ -/* { dg-do compile } */ -/* { dg-options "-O3 -march=z13 -mzarch --save-temps" } */ -/* { dg-do run { target { s390_z13_hw } } } */ - -/* - * The vector intrinsic vec_permi(a, b, c) chooses one of the two eight-byte - * vector elements in each of a and b, depending on the value of c. The valid - * values for c differ from the encoding for the M4 field in assembly and in the - * binary instruction. - * - * selection | c | encoding in assembly - * a[0] b[0] | 0 | 0 - * a[0] b[1] | 1 | 1 - * a[1] b[0] | 2 | 4 - * a[1] b[1] | 3 | 5 - * - * (i.e., indices a[i] b[j] are encoded for c as (i<<1) | j, yet for the - * M4 field as (i<<2) | j. - */ -#include <assert.h> -#include <vecintrin.h> - -typedef unsigned long long uv2di __attribute__((vector_size(16))); - -__attribute__ ((noipa)) static uv2di -do_vec_permi(uv2di a, uv2di b, int c) -{ - switch(c) { - case 0: return vec_permi(a, b, 0); - case 1: return vec_permi(a, b, 1); - case 2: return vec_permi(a, b, 2); - case 3: return vec_permi(a, b, 3); - default: assert(0); - } -} - -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,4\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } } */ - -int -main (void) -{ - uv2di a = { 0xa0, 0xa1 }; - uv2di b = { 0xb0, 0xb1 }; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) { - uv2di res = do_vec_permi(a, b, (i<<1)|j); - assert(res[0] == a[i]); - assert(res[1] == b[j]); - } -} diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-types.h b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h new file mode 100644 index 0000000..35bd2a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h @@ -0,0 +1,37 @@ +#ifndef VEC_TYPES_H +#define VEC_TYPES_H 1 + +#include <vecintrin.h> + +typedef __vector signed char v16qi; +typedef __vector unsigned char uv16qi; + +typedef __vector signed short v8hi; +typedef __vector unsigned short uv8hi; + +typedef __vector signed int v4si; +typedef __vector unsigned int uv4si; + +typedef __vector signed long long v2di; +typedef __vector unsigned long long uv2di; + +#if __SIZEOF_INT128__ == 16 +typedef __vector __int128_t v1ti; +#endif + +typedef __vector double v2df; +typedef __vector long double v1tf; + +#if __ARCH__ >= 12 +typedef __vector float v4sf; +#endif + +#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \ + ({ VEC_TYPE dummy; \ + const int 
elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \ + typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \ + for (int i = 0; i < elts; i++) \ + ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \ + *(VEC_TYPE*)ar;}) + +#endif diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c new file mode 100644 index 0000000..348d1f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c @@ -0,0 +1,88 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ + +/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ + +#include "vec-types.h" +#include <vecintrin.h> + +#define GEN_MERGE(VEC_TYPE, HILO) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return vec_merge##HILO (a, b); } + +GEN_MERGE(v16qi, l) +GEN_MERGE(v16qi, h) +GEN_MERGE(uv16qi, l) +GEN_MERGE(uv16qi, h) + +GEN_MERGE(v8hi, l) +GEN_MERGE(v8hi, h) +GEN_MERGE(uv8hi, l) +GEN_MERGE(uv8hi, h) + +GEN_MERGE(v4si, l) +GEN_MERGE(v4si, h) +GEN_MERGE(uv4si, l) +GEN_MERGE(uv4si, h) + +GEN_MERGE(v4sf, l) +GEN_MERGE(v4sf, h) + +GEN_MERGE(v2di, l) +GEN_MERGE(v2di, h) +GEN_MERGE(uv2di, l) +GEN_MERGE(uv2di, h) + +GEN_MERGE(v2df, l) +GEN_MERGE(v2df, h) + + +#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != (i + elts) / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != i / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \ + CHECK_MERGE_LO (VEC_TYPE, a, b); \ + CHECK_MERGE_HI (VEC_TYPE, a, b); \ + } + +int +main () +{ + CHECK_MERGE(v16qi); + CHECK_MERGE(uv16qi); + CHECK_MERGE(v8hi); + CHECK_MERGE(uv8hi); + CHECK_MERGE(v4si); + CHECK_MERGE(uv4si); + CHECK_MERGE(v4sf); + CHECK_MERGE(v2di); + CHECK_MERGE(uv2di); + CHECK_MERGE(v2df); +} diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c b/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c new file mode 100644 index 0000000..b66fa90 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z13_hw } } } */ + +/* + * The vector intrinsic vec_permi(a, b, c) chooses one of the two eight-byte + * vector elements in each of a and b, depending on the value of c. The valid + * values for c differ from the encoding for the M4 field in assembly and in the + * binary instruction. 
+ * + * selection | c | encoding in assembly + * a[0] b[0] | 0 | 0 -> vmrhg + * a[0] b[1] | 1 | 1 + * a[1] b[0] | 2 | 4 + * a[1] b[1] | 3 | 5 -> vmrlg + * + * (i.e., indices a[i] b[j] are encoded for c as (i<<1) | j, yet for the + * M4 field as (i<<2) | j. + */ + +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvpdi\t" 6 } } */ + +#include "vec-types.h" +#include <vecintrin.h> + +#define GEN_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE __attribute__((noinline)) \ + permi_##BITS##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return vec_permi (a, b, (BITS)); } + +#define GEN_PERMI(VEC_TYPE) \ + GEN_PERMI_BITS(VEC_TYPE, 0); \ + GEN_PERMI_BITS(VEC_TYPE, 1); \ + GEN_PERMI_BITS(VEC_TYPE, 2); \ + GEN_PERMI_BITS(VEC_TYPE, 3); + +GEN_PERMI(v2di) +GEN_PERMI(uv2di) +GEN_PERMI(v2df) + + +#define CHECK_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE r##BITS = permi_##BITS##_##VEC_TYPE (a, b); \ + if (r##BITS[0] != ((BITS) & 2) >> 1 \ + || r##BITS[1] != ((BITS) & 1) + 2) \ + __builtin_abort(); + +#define CHECK_PERMI(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 0); \ + CHECK_PERMI_BITS (VEC_TYPE, 1); \ + CHECK_PERMI_BITS (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 3); \ + } + +int +main () +{ + CHECK_PERMI (v2di); + CHECK_PERMI (uv2di); + CHECK_PERMI (v2df); +} diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 index 9994805..cc1aebf 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 @@ -706,5 +706,5 @@ END SUBROUTINE kernel ! { dg-final { scan-tree-dump-times "vectorized 24 loops" 1 "vect" { target aarch64_sve } } } ! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target { aarch64*-*-* && { ! aarch64_sve } } } } } -! { dg-final { scan-tree-dump-times "vectorized 2\[23\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } +! { dg-final { scan-tree-dump-times "vectorized 2\[234\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } diff --git a/gcc/testsuite/lib/profopt.exp b/gcc/testsuite/lib/profopt.exp index 9997eb3..25f45ec 100644 --- a/gcc/testsuite/lib/profopt.exp +++ b/gcc/testsuite/lib/profopt.exp @@ -289,8 +289,8 @@ proc auto-profopt-execute { src } { return } set profile_wrapper [profopt-perf-wrapper] - set profile_option "-g -DFOR_AUTOFDO_TESTING" - set feedback_option "-fauto-profile -DFOR_AUTOFDO_TESTING" + set profile_option "-gdwarf-4 -DFOR_AUTOFDO_TESTING" + set feedback_option "-fauto-profile -DFOR_AUTOFDO_TESTING -fearly-inlining" set run_autofdo 1 profopt-execute $src unset profile_wrapper @@ -451,7 +451,7 @@ proc profopt-execute { src } { # convert profile if { $run_autofdo == 1 } { set bprefix "afdo." 
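	    # (create_gcov comes from the external autofdo package: it
	    # converts the perf.data gathered by the profiling run into
	    # the .gcov profile that -fauto-profile reads back.)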
-	set cmd "create_gcov --binary $execname1 --profile=$tmpdir/$base.perf.data -gcov_version=1 --gcov=$tmpdir/$bprefix$base.$ext"
+	set cmd "create_gcov --binary $execname1 --profile=$tmpdir/$base.perf.data -gcov_version=2 --gcov=$tmpdir/$bprefix$base.$ext"
	verbose "Running $cmd"
	set id [remote_spawn "" $cmd]
	if { $id < 0 } {
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 42ac9d0..44465b1 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -626,7 +626,7 @@ proc check_effective_target_keeps_null_pointer_checks { } {
 # this allows parallelism of 16 and higher of parallel gcc-auto-profile
 proc profopt-perf-wrapper { } {
     global srcdir
-    return "$srcdir/../config/i386/gcc-auto-profile -o perf.data -m8 "
+    return "$srcdir/../config/i386/gcc-auto-profile -m8 "
 }

 # Return true if profiling is supported on the target.
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index c05d22f..3a9e14f 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -2790,7 +2790,10 @@ propagate_subaccesses_from_rhs (struct access *lacc, struct access *racc)
     {
       /* We are about to change the access type from aggregate to scalar,
	 so we need to put the reverse flag onto the access, if any.  */
-      const bool reverse = TYPE_REVERSE_STORAGE_ORDER (lacc->type);
+      const bool reverse
+	= TYPE_REVERSE_STORAGE_ORDER (lacc->type)
+	  && !POINTER_TYPE_P (racc->type)
+	  && !VECTOR_TYPE_P (racc->type);
       tree t = lacc->base;
       lacc->type = racc->type;
diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
index db3b18b..bd64b8e 100644
--- a/gcc/tree-ssa-forwprop.c
+++ b/gcc/tree-ssa-forwprop.c
@@ -2757,6 +2757,182 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
 }
+
+/* Rewrite the vector load at *GSI to component-wise loads if the load
+   is only used in BIT_FIELD_REF extractions, possibly with intermediate
+   widening.  */
+
+static void
+optimize_vector_load (gimple_stmt_iterator *gsi)
+{
+  gimple *stmt = gsi_stmt (*gsi);
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+
+  /* Gather BIT_FIELD_REFs to rewrite, looking through
+     VEC_UNPACK_{LO,HI}_EXPR.  */
+  use_operand_p use_p;
+  imm_use_iterator iter;
+  bool rewrite = true;
+  auto_vec<gimple *, 8> bf_stmts;
+  auto_vec<tree, 8> worklist;
+  worklist.quick_push (lhs);
+  do
+    {
+      tree def = worklist.pop ();
+      unsigned HOST_WIDE_INT def_eltsize
+	= TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
+      FOR_EACH_IMM_USE_FAST (use_p, iter, def)
+	{
+	  gimple *use_stmt = USE_STMT (use_p);
+	  if (is_gimple_debug (use_stmt))
+	    continue;
+	  if (!is_gimple_assign (use_stmt))
+	    {
+	      rewrite = false;
+	      break;
+	    }
+	  enum tree_code use_code = gimple_assign_rhs_code (use_stmt);
+	  tree use_rhs = gimple_assign_rhs1 (use_stmt);
+	  if (use_code == BIT_FIELD_REF
+	      && TREE_OPERAND (use_rhs, 0) == def
+	      /* If it's on a VEC_UNPACK_{HI,LO}_EXPR def we need to
+		 verify it is element aligned.  */
+	      && (def == lhs
+		  || (known_eq (bit_field_size (use_rhs), def_eltsize)
+		      && constant_multiple_p (bit_field_offset (use_rhs),
+					      def_eltsize))))
+	    {
+	      bf_stmts.safe_push (use_stmt);
+	      continue;
+	    }
+	  /* Walk through one level of VEC_UNPACK_{LO,HI}_EXPR.
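+	     E.g. (a sketch; the names are illustrative only):
+
+	       vect_1 = MEM <vector(8) short int> [p_2];
+	       vect_3 = [vec_unpack_lo_expr] vect_1;
+	       _4 = BIT_FIELD_REF <vect_3, 32, 0>;
+
+	     where the element-aligned BIT_FIELD_REF on the unpacked
+	     vector can still be rewritten to a scalar load plus a
+	     widening conversion.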
+	     */
+	  if (def == lhs
+	      && (use_code == VEC_UNPACK_HI_EXPR
+		  || use_code == VEC_UNPACK_LO_EXPR)
+	      && use_rhs == lhs)
+	    {
+	      worklist.safe_push (gimple_assign_lhs (use_stmt));
+	      continue;
+	    }
+	  rewrite = false;
+	  break;
+	}
+      if (!rewrite)
+	break;
+    }
+  while (!worklist.is_empty ());
+
+  if (!rewrite)
+    {
+      gsi_next (gsi);
+      return;
+    }
+  /* We now have all ultimate uses of the load to rewrite in bf_stmts.  */
+
+  /* Prepare the original ref to be wrapped in adjusted BIT_FIELD_REFs.
+     For TARGET_MEM_REFs we have to separate the LEA from the reference.  */
+  tree load_rhs = rhs;
+  if (TREE_CODE (load_rhs) == TARGET_MEM_REF)
+    {
+      if (TREE_CODE (TREE_OPERAND (load_rhs, 0)) == ADDR_EXPR)
+	mark_addressable (TREE_OPERAND (TREE_OPERAND (load_rhs, 0), 0));
+      tree tem = make_ssa_name (TREE_TYPE (TREE_OPERAND (load_rhs, 0)));
+      gimple *new_stmt
+	= gimple_build_assign (tem, build1 (ADDR_EXPR, TREE_TYPE (tem),
+					    unshare_expr (load_rhs)));
+      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+      load_rhs = build2_loc (EXPR_LOCATION (load_rhs),
+			     MEM_REF, TREE_TYPE (load_rhs), tem,
+			     build_int_cst
+			       (TREE_TYPE (TREE_OPERAND (load_rhs, 1)), 0));
+    }
+
+  /* Rewrite the BIT_FIELD_REFs to be actual loads, re-emitting them at
+     the place of the original load.  */
+  for (gimple *use_stmt : bf_stmts)
+    {
+      tree bfr = gimple_assign_rhs1 (use_stmt);
+      tree new_rhs = unshare_expr (load_rhs);
+      if (TREE_OPERAND (bfr, 0) != lhs)
+	{
+	  /* When the BIT_FIELD_REF is on the promoted vector we have to
+	     adjust it and emit a conversion afterwards.  */
+	  gimple *def_stmt
+	    = SSA_NAME_DEF_STMT (TREE_OPERAND (bfr, 0));
+	  enum tree_code def_code
+	    = gimple_assign_rhs_code (def_stmt);
+
+	  /* The adjusted BIT_FIELD_REF is of the promotion source
+	     vector size and at half of the offset...  */
+	  new_rhs = fold_build3 (BIT_FIELD_REF,
+				 TREE_TYPE (TREE_TYPE (lhs)),
+				 new_rhs,
+				 TYPE_SIZE (TREE_TYPE (TREE_TYPE (lhs))),
+				 size_binop (EXACT_DIV_EXPR,
+					     TREE_OPERAND (bfr, 2),
+					     bitsize_int (2)));
+	  /* ... and offset by half of the vector if VEC_UNPACK_HI_EXPR.  */
+	  if (def_code == (!BYTES_BIG_ENDIAN
+			   ? VEC_UNPACK_HI_EXPR : VEC_UNPACK_LO_EXPR))
+	    TREE_OPERAND (new_rhs, 2)
+	      = size_binop (PLUS_EXPR, TREE_OPERAND (new_rhs, 2),
+			    size_binop (EXACT_DIV_EXPR,
+					TYPE_SIZE (TREE_TYPE (lhs)),
+					bitsize_int (2)));
+	  tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (lhs)));
+	  gimple *new_stmt = gimple_build_assign (tem, new_rhs);
+	  location_t loc = gimple_location (use_stmt);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+	  /* Perform scalar promotion.  */
+	  new_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
+					  NOP_EXPR, tem);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+	}
+      else
+	{
+	  /* When the BIT_FIELD_REF is on the original load result
+	     we can just wrap that.  */
+	  tree new_rhs = fold_build3 (BIT_FIELD_REF, TREE_TYPE (bfr),
+				      unshare_expr (load_rhs),
+				      TREE_OPERAND (bfr, 1),
+				      TREE_OPERAND (bfr, 2));
+	  gimple *new_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
+						  new_rhs);
+	  location_t loc = gimple_location (use_stmt);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+	}
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt);
+      unlink_stmt_vdef (use_stmt);
+      gsi_remove (&gsi2, true);
+    }
+
+  /* Finally get rid of the intermediate stmts.
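+     These are the VEC_UNPACK_{LO,HI}_EXPR statements we looked through
+     above; all of their BIT_FIELD_REF uses were rewritten, so apart
+     from debug binds they are now dead.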
+     */
+  gimple *use_stmt;
+  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
+    {
+      if (is_gimple_debug (use_stmt))
+	{
+	  if (gimple_debug_bind_p (use_stmt))
+	    {
+	      gimple_debug_bind_reset_value (use_stmt);
+	      update_stmt (use_stmt);
+	    }
+	  continue;
+	}
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt);
+      unlink_stmt_vdef (use_stmt);
+      release_defs (use_stmt);
+      gsi_remove (&gsi2, true);
+    }
+  /* And the original load.  */
+  release_defs (stmt);
+  gsi_remove (gsi, true);
+}
+
+
 /* Primitive "lattice" function for gimple_simplify.  */

 static tree
@@ -3007,71 +3183,15 @@ pass_forwprop::execute (function *fun)
	      gsi_next (&gsi);
	    }
	  else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
-		   && TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
+		   && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
+		       /* After vector lowering rewrite all loads, but
+			  initially do not since this conflicts with
+			  vector CONSTRUCTOR to shuffle optimization.  */
+		       || (fun->curr_properties & PROP_gimple_lvec))
		   && gimple_assign_load_p (stmt)
		   && !gimple_has_volatile_ops (stmt)
-		   && (TREE_CODE (gimple_assign_rhs1 (stmt))
-		       != TARGET_MEM_REF)
		   && !stmt_can_throw_internal (cfun, stmt))
-	    {
-	      /* Rewrite loads used only in BIT_FIELD_REF extractions to
-		 component-wise loads.  */
-	      use_operand_p use_p;
-	      imm_use_iterator iter;
-	      bool rewrite = true;
-	      FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
-		{
-		  gimple *use_stmt = USE_STMT (use_p);
-		  if (is_gimple_debug (use_stmt))
-		    continue;
-		  if (!is_gimple_assign (use_stmt)
-		      || gimple_assign_rhs_code (use_stmt) != BIT_FIELD_REF
-		      || TREE_OPERAND (gimple_assign_rhs1 (use_stmt), 0) != lhs)
-		    {
-		      rewrite = false;
-		      break;
-		    }
-		}
-	      if (rewrite)
-		{
-		  gimple *use_stmt;
-		  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
-		    {
-		      if (is_gimple_debug (use_stmt))
-			{
-			  if (gimple_debug_bind_p (use_stmt))
-			    {
-			      gimple_debug_bind_reset_value (use_stmt);
-			      update_stmt (use_stmt);
-			    }
-			  continue;
-			}
-
-		      tree bfr = gimple_assign_rhs1 (use_stmt);
-		      tree new_rhs = fold_build3 (BIT_FIELD_REF,
-						  TREE_TYPE (bfr),
-						  unshare_expr (rhs),
-						  TREE_OPERAND (bfr, 1),
-						  TREE_OPERAND (bfr, 2));
-		      gimple *new_stmt
-			= gimple_build_assign (gimple_assign_lhs (use_stmt),
-					       new_rhs);
-
-		      location_t loc = gimple_location (use_stmt);
-		      gimple_set_location (new_stmt, loc);
-		      gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt);
-		      unlink_stmt_vdef (use_stmt);
-		      gsi_remove (&gsi2, true);
-
-		      gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
-		    }
-
-		  release_defs (stmt);
-		  gsi_remove (&gsi, true);
-		}
-	      else
-		gsi_next (&gsi);
-	    }
+	    optimize_vector_load (&gsi);
	  else if (code == COMPLEX_EXPR)
	    {
diff --git a/gcc/tree-tailcall.c b/gcc/tree-tailcall.c
index f2833d2..f2f3a6b 100644
--- a/gcc/tree-tailcall.c
+++ b/gcc/tree-tailcall.c
@@ -131,9 +131,6 @@ static tree m_acc, a_acc;

 static bitmap tailr_arg_needs_copy;

-static bool optimize_tail_call (struct tailcall *, bool);
-static void eliminate_tail_call (struct tailcall *);
-
 /* Returns false when the function is not suitable for tail call
    optimization for some reason (e.g. if it takes a variable number of
    arguments).  */
@@ -926,10 +923,11 @@ decrease_profile (basic_block bb, profile_count count)
 }

 /* Eliminates tail call described by T.  TMP_VARS is a list of
-   temporary variables used to copy the function arguments.  */
+   temporary variables used to copy the function arguments.
+   Allocates *NEW_LOOP if not already done and initializes it.
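+   For instance (an illustrative sketch, not taken from this change):
+
+     int sum (int n, int acc) { return n ? sum (n - 1, acc + n) : acc; }
+
+   has its recursive call replaced by a jump back to the header of
+   *NEW_LOOP, with N and ACC flowing around the loop through the PHI
+   nodes added for the arguments.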
+   */

 static void
-eliminate_tail_call (struct tailcall *t)
+eliminate_tail_call (struct tailcall *t, class loop *&new_loop)
 {
   tree param, rslt;
   gimple *stmt, *call;
@@ -999,6 +997,16 @@ eliminate_tail_call (struct tailcall *t)
   gcc_assert (e);
   PENDING_STMT (e) = NULL;

+  /* Add the new loop.  */
+  if (!new_loop)
+    {
+      new_loop = alloc_loop ();
+      new_loop->header = first;
+      new_loop->finite_p = true;
+    }
+  else
+    gcc_assert (new_loop->header == first);
+
   /* Add phi node entries for arguments.  The ordering of the phi nodes should
      be the same as the ordering of the arguments.  */
   for (param = DECL_ARGUMENTS (current_function_decl),
@@ -1037,11 +1045,12 @@ eliminate_tail_call (struct tailcall *t)
    mark the tailcalls for the sibcall optimization.  */

 static bool
-optimize_tail_call (struct tailcall *t, bool opt_tailcalls)
+optimize_tail_call (struct tailcall *t, bool opt_tailcalls,
+		    class loop *&new_loop)
 {
   if (t->tail_recursion)
     {
-      eliminate_tail_call (t);
+      eliminate_tail_call (t, new_loop);
       return true;
     }

@@ -1177,12 +1186,15 @@ tree_optimize_tail_calls_1 (bool opt_tailcalls)
	opt_tailcalls = false;
     }

+  class loop *new_loop = NULL;
   for (; tailcalls; tailcalls = next)
     {
       next = tailcalls->next;
-      changed |= optimize_tail_call (tailcalls, opt_tailcalls);
+      changed |= optimize_tail_call (tailcalls, opt_tailcalls, new_loop);
       free (tailcalls);
     }
+  if (new_loop)
+    add_loop (new_loop, loops_for_fn (cfun)->tree_root);

   if (a_acc || m_acc)
     {
@@ -1198,11 +1210,7 @@ tree_optimize_tail_calls_1 (bool opt_tailcalls)
     }

   if (changed)
-    {
-      /* We may have created new loops.  Make them magically appear.  */
-      loops_state_set (LOOPS_NEED_FIXUP);
-      free_dominance_info (CDI_DOMINATORS);
-    }
+    free_dominance_info (CDI_DOMINATORS);

   /* Add phi nodes for the virtual operands defined in the function to the
      header of the loop created by tail recursion elimination.  Do so
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 6995efb..17d24b4 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -4007,8 +4007,24 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
	  continue;
	}

-      if (TYPE_PRECISION (TREE_TYPE (op0))
-	  < TYPE_PRECISION (TREE_TYPE (off)))
+      /* Include the conversion if it is widening and either we're using
+	 the IFN path, the target can handle the converted-from offset
+	 type, or the current offset size is not already the same as the
+	 data vector element size.  */
+      if ((TYPE_PRECISION (TREE_TYPE (op0))
+	   < TYPE_PRECISION (TREE_TYPE (off)))
+	  && (use_ifn_p
+	      || (DR_IS_READ (dr)
+		  ? (targetm.vectorize.builtin_gather
+		     && targetm.vectorize.builtin_gather (vectype,
+							  TREE_TYPE (op0),
+							  scale))
+		  : (targetm.vectorize.builtin_scatter
+		     && targetm.vectorize.builtin_scatter (vectype,
+							   TREE_TYPE (op0),
+							   scale)))
+	      || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
+				   TYPE_SIZE (TREE_TYPE (vectype)), 0)))
	{
	  off = op0;
	  offtype = TREE_TYPE (off);
@@ -4036,7 +4052,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
				     vectype, memory_type, offtype, scale,
				     &ifn, &offset_vectype))
-	return false;
+	ifn = IFN_LAST;
+      decl = NULL_TREE;
     }
   else
     {
@@ -4050,10 +4067,6 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
       if (targetm.vectorize.builtin_scatter)
	decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
     }
-
-  if (!decl)
-    return false;
-
-  ifn = IFN_LAST;
   /* The offset vector type will be read from DECL when needed.
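      If neither an internal function nor a target builtin is available
      (IFN == IFN_LAST and DECL == NULL_TREE), callers may still fall
      back to an element-wise emulated gather; see vectorizable_load.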
*/ offset_vectype = NULL_TREE; @@ -4283,9 +4296,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) { bool maybe_gather = DR_IS_READ (dr) - && !TREE_THIS_VOLATILE (DR_REF (dr)) - && (targetm.vectorize.builtin_gather != NULL - || supports_vec_gather_load_p ()); + && !TREE_THIS_VOLATILE (DR_REF (dr)); bool maybe_scatter = DR_IS_WRITE (dr) && !TREE_THIS_VOLATILE (DR_REF (dr)) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 00a57b2..1e21fe6 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2772,7 +2772,15 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, /* Limit the VFs to what is likely to be the maximum number of iterations, to handle cases in which at least one loop_vinfo is fully-masked. */ - HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_max_niter; + loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo); + unsigned HOST_WIDE_INT main_vf; + if (main_loop + && LOOP_VINFO_NITERS_KNOWN_P (main_loop) + && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf)) + estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf; + else + estimated_max_niter = likely_max_stmt_executions_int (loop); if (estimated_max_niter != -1) { if (known_le (estimated_max_niter, new_vf)) @@ -3064,7 +3072,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) = opt_loop_vec_info::success (main_loop_vinfo); } else - delete main_loop_vinfo; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** No longer preferring vector" + " mode %s after reanalyzing the loop" + " as a main loop\n", + GET_MODE_NAME + (main_loop_vinfo->vector_mode)); + delete main_loop_vinfo; + } } } diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 743fd3f..25de97b 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -4811,7 +4811,7 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo, function for the gather/scatter operation. */ gather_scatter_info gs_info; if (!vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info) - || gs_info.decl) + || gs_info.ifn == IFN_LAST) return NULL; /* Convert the mask to the right form. 
*/ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index a554c24..d169bed 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -4847,15 +4847,17 @@ static bool vectorizable_bb_reduc_epilogue (slp_instance instance, stmt_vector_for_cost *cost_vec) { - enum tree_code reduc_code - = gimple_assign_rhs_code (instance->root_stmts[0]->stmt); + gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt); + enum tree_code reduc_code = gimple_assign_rhs_code (stmt); if (reduc_code == MINUS_EXPR) reduc_code = PLUS_EXPR; internal_fn reduc_fn; tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance)); if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn) || reduc_fn == IFN_LAST - || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)) + || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) + || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), + TREE_TYPE (vectype))) return false; /* There's no way to cost a horizontal vector reduction via REDUC_FN so diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 074dfdc..e6b81a0 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1084,6 +1084,7 @@ static void vect_model_load_cost (vec_info *vinfo, stmt_vec_info stmt_info, unsigned ncopies, poly_uint64 vf, vect_memory_access_type memory_access_type, + gather_scatter_info *gs_info, slp_tree slp_node, stmt_vector_for_cost *cost_vec) { @@ -1172,9 +1173,17 @@ vect_model_load_cost (vec_info *vinfo, if (memory_access_type == VMAT_ELEMENTWISE || memory_access_type == VMAT_GATHER_SCATTER) { - /* N scalar loads plus gathering them into a vector. */ tree vectype = STMT_VINFO_VECTYPE (stmt_info); unsigned int assumed_nunits = vect_nunits_for_cost (vectype); + if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info->ifn == IFN_LAST && !gs_info->decl) + /* For emulated gathers N offset vector element extracts + (we assume the scalar scaling and ptr + offset add is consumed by + the load). */ + inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits, + vec_to_scalar, stmt_info, 0, + vect_body); + /* N scalar loads plus gathering them into a vector. */ inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits, scalar_load, stmt_info, 0, vect_body); @@ -1184,7 +1193,9 @@ vect_model_load_cost (vec_info *vinfo, &inside_cost, &prologue_cost, cost_vec, cost_vec, true); if (memory_access_type == VMAT_ELEMENTWISE - || memory_access_type == VMAT_STRIDED_SLP) + || memory_access_type == VMAT_STRIDED_SLP + || (memory_access_type == VMAT_GATHER_SCATTER + && gs_info->ifn == IFN_LAST && !gs_info->decl)) inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct, stmt_info, 0, vect_body); @@ -1866,7 +1877,8 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree memory_type = TREE_TYPE (DR_REF (dr)); if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, vectype, memory_type, offset_type, scale, - &gs_info->ifn, &gs_info->offset_vectype)) + &gs_info->ifn, &gs_info->offset_vectype) + || gs_info->ifn == IFN_LAST) continue; gs_info->decl = NULL_TREE; @@ -1901,7 +1913,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info, gather_scatter_info *gs_info) { if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info) - || gs_info->decl) + || gs_info->ifn == IFN_LAST) return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo, masked_p, gs_info); @@ -2355,6 +2367,27 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, vls_type == VLS_LOAD ? 
"gather" : "scatter"); return false; } + else if (gs_info->ifn == IFN_LAST && !gs_info->decl) + { + if (vls_type != VLS_LOAD) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported emulated scatter.\n"); + return false; + } + else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant () + || !known_eq (TYPE_VECTOR_SUBPARTS (vectype), + TYPE_VECTOR_SUBPARTS + (gs_info->offset_vectype))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported vector types for emulated " + "gather.\n"); + return false; + } + } /* Gather-scatter accesses perform only component accesses, alignment is irrelevant for them. */ *alignment_support_scheme = dr_unaligned_supported; @@ -8692,6 +8725,15 @@ vectorizable_load (vec_info *vinfo, "unsupported access type for masked load.\n"); return false; } + else if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info.ifn == IFN_LAST + && !gs_info.decl) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported masked emulated gather.\n"); + return false; + } } if (!vec_stmt) /* transformation not required. */ @@ -8725,7 +8767,7 @@ vectorizable_load (vec_info *vinfo, STMT_VINFO_TYPE (orig_stmt_info) = load_vec_info_type; vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type, - slp_node, cost_vec); + &gs_info, slp_node, cost_vec); return true; } @@ -9438,7 +9480,8 @@ vectorizable_load (vec_info *vinfo, unsigned int misalign; unsigned HOST_WIDE_INT align; - if (memory_access_type == VMAT_GATHER_SCATTER) + if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info.ifn != IFN_LAST) { tree zero = build_zero_cst (vectype); tree scale = size_int (gs_info.scale); @@ -9456,6 +9499,51 @@ vectorizable_load (vec_info *vinfo, data_ref = NULL_TREE; break; } + else if (memory_access_type == VMAT_GATHER_SCATTER) + { + /* Emulated gather-scatter. 
+			 */
+		      gcc_assert (!final_mask);
+		      unsigned HOST_WIDE_INT const_nunits
+			= nunits.to_constant ();
+		      vec<constructor_elt, va_gc> *ctor_elts;
+		      vec_alloc (ctor_elts, const_nunits);
+		      gimple_seq stmts = NULL;
+		      tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+		      tree scale = size_int (gs_info.scale);
+		      align
+			= get_object_alignment (DR_REF (first_dr_info->dr));
+		      tree ltype = build_aligned_type (TREE_TYPE (vectype),
+						       align);
+		      for (unsigned k = 0; k < const_nunits; ++k)
+			{
+			  tree boff = size_binop (MULT_EXPR,
+						  TYPE_SIZE (idx_type),
+						  bitsize_int (k));
+			  tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+						   idx_type, vec_offset,
+						   TYPE_SIZE (idx_type),
+						   boff);
+			  idx = gimple_convert (&stmts, sizetype, idx);
+			  idx = gimple_build (&stmts, MULT_EXPR,
+					      sizetype, idx, scale);
+			  tree ptr = gimple_build (&stmts, PLUS_EXPR,
+						   TREE_TYPE (dataref_ptr),
+						   dataref_ptr, idx);
+			  ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+			  tree elt = make_ssa_name (TREE_TYPE (vectype));
+			  tree ref = build2 (MEM_REF, ltype, ptr,
+					     build_int_cst (ref_type, 0));
+			  new_stmt = gimple_build_assign (elt, ref);
+			  gimple_seq_add_stmt (&stmts, new_stmt);
+			  CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+			}
+		      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+		      new_stmt = gimple_build_assign (NULL_TREE,
+						      build_constructor
+							(vectype, ctor_elts));
+		      data_ref = NULL_TREE;
+		      break;
+		    }

		  align =
		    known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index deb2247..686644b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2192,4 +2192,108 @@ extern vect_pattern_decl_t slp_patterns[];
 /* Number of supported pattern matchers.  */
 extern size_t num__slp_patterns;

+/* ----------------------------------------------------------------------
+   Target support routines
+   -----------------------------------------------------------------------
+   The following routines are provided to simplify costing decisions in
+   target code.  Please add more as needed.  */
+
+/* Return true if an operation of kind KIND for STMT_INFO represents
+   the extraction of an element from a vector in preparation for
+   storing the element to memory.  */
+inline bool
+vect_is_store_elt_extraction (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
+{
+  return (kind == vec_to_scalar
+	  && STMT_VINFO_DATA_REF (stmt_info)
+	  && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
+}
+
+/* Return true if STMT_INFO represents part of a reduction.  */
+inline bool
+vect_is_reduction (stmt_vec_info stmt_info)
+{
+  return (STMT_VINFO_REDUC_DEF (stmt_info)
+	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
+}
+
+/* If STMT_INFO describes a reduction, return the vect_reduction_type
+   of the reduction it describes, otherwise return -1.  */
+inline int
+vect_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
+{
+  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
+    if (STMT_VINFO_REDUC_DEF (stmt_info))
+      {
+	stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+	return int (STMT_VINFO_REDUC_TYPE (reduc_info));
+      }
+  return -1;
+}
+
+/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
+   scalar type of the values being compared.  Return null otherwise.
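+   For example, for _1 = _2 < _3 ? _4 : _5 this returns the scalar
+   type of _2.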
+   */
+inline tree
+vect_embedded_comparison_type (stmt_vec_info stmt_info)
+{
+  if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
+    if (gimple_assign_rhs_code (assign) == COND_EXPR)
+      {
+	tree cond = gimple_assign_rhs1 (assign);
+	if (COMPARISON_CLASS_P (cond))
+	  return TREE_TYPE (TREE_OPERAND (cond, 0));
+      }
+  return NULL_TREE;
+}
+
+/* If STMT_INFO is a comparison or contains an embedded comparison, return the
+   scalar type of the values being compared.  Return null otherwise.  */
+inline tree
+vect_comparison_type (stmt_vec_info stmt_info)
+{
+  if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
+    if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
+      return TREE_TYPE (gimple_assign_rhs1 (assign));
+  return vect_embedded_comparison_type (stmt_info);
+}
+
+/* Return true if STMT_INFO extends the result of a load.  */
+inline bool
+vect_is_extending_load (class vec_info *vinfo, stmt_vec_info stmt_info)
+{
+  /* Although this is quite large for an inline function, this part
+     at least should be inline.  */
+  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
+    return false;
+
+  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
+  tree rhs_type = TREE_TYPE (rhs);
+  if (!INTEGRAL_TYPE_P (lhs_type)
+      || !INTEGRAL_TYPE_P (rhs_type)
+      || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
+    return false;
+
+  stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
+  return (def_stmt_info
+	  && STMT_VINFO_DATA_REF (def_stmt_info)
+	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
+}
+
+/* Return true if STMT_INFO is an integer truncation.  */
+inline bool
+vect_is_integer_truncation (stmt_vec_info stmt_info)
+{
+  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
+  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
+    return false;
+
+  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
+  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
+  return (INTEGRAL_TYPE_P (lhs_type)
+	  && INTEGRAL_TYPE_P (rhs_type)
+	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
+}
+
 #endif /* GCC_TREE_VECTORIZER_H */