From 98d7f305d5081bc91c16b9d2b4d62196b86bca86 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 16 Jul 2021 10:29:46 -0700 Subject: x86: Use XMM31 for scratch SSE register In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper if possible. gcc/ * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode, try XMM31 to avoid vzeroupper. gcc/testsuite/ * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to disable XMM31. * gcc.target/i386/avx-vzeroupper-15.c: Likewise. * gcc.target/i386/pr82941-1.c: Updated. Check for vzeroupper. * gcc.target/i386/pr82942-1.c: Likewise. * gcc.target/i386/pr82990-1.c: Likewise. * gcc.target/i386/pr82990-3.c: Likewise. * gcc.target/i386/pr82990-5.c: Likewise. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-6b.c: Likewise. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr100865-10b.c: Likewise. * gcc.target/i386/pr100865-8b.c: Updated. * gcc.target/i386/pr100865-9b.c: Likewise. * gcc.target/i386/pr100865-11b.c: Likewise. * gcc.target/i386/pr100865-12b.c: Likewise. 
--- gcc/config/i386/i386.c | 18 +++++++++++++++--- gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c | 2 +- gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c | 2 +- gcc/testsuite/gcc.target/i386/pr100865-10b.c | 1 + gcc/testsuite/gcc.target/i386/pr100865-11b.c | 2 +- gcc/testsuite/gcc.target/i386/pr100865-12b.c | 2 +- gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 ++ gcc/testsuite/gcc.target/i386/pr100865-6b.c | 5 ++++- gcc/testsuite/gcc.target/i386/pr100865-7b.c | 5 ++++- gcc/testsuite/gcc.target/i386/pr100865-8b.c | 2 +- gcc/testsuite/gcc.target/i386/pr100865-9b.c | 2 +- gcc/testsuite/gcc.target/i386/pr82941-1.c | 3 ++- gcc/testsuite/gcc.target/i386/pr82942-1.c | 3 ++- gcc/testsuite/gcc.target/i386/pr82990-1.c | 3 ++- gcc/testsuite/gcc.target/i386/pr82990-3.c | 3 ++- gcc/testsuite/gcc.target/i386/pr82990-5.c | 3 ++- 16 files changed, 42 insertions(+), 16 deletions(-) (limited to 'gcc') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 842eb0e..ec06908 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -23335,9 +23335,21 @@ rtx ix86_gen_scratch_sse_rtx (machine_mode mode) { if (TARGET_SSE && !lra_in_progress) - return gen_rtx_REG (mode, (TARGET_64BIT - ? LAST_REX_SSE_REG - : LAST_SSE_REG)); + { + unsigned int regno; + if (TARGET_64BIT) + { + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always + use XMM31 for CSE. 
*/ + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode)) + regno = LAST_EXT_REX_SSE_REG; + else + regno = LAST_REX_SSE_REG; + } + else + regno = LAST_SSE_REG; + return gen_rtx_REG (mode, regno); + } else return gen_reg_rtx (mode); } diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c index a31b4a2..9590f25 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */ +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */ #include diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c index 803936e..36dcf73 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */ +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */ #include diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c index e5616d8..77ace86 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c @@ -5,3 +5,4 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-not "vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-11b.c b/gcc/testsuite/gcc.target/i386/pr100865-11b.c index 12d55b9..7e458e8 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-11b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-11b.c @@ -5,4 +5,4 @@ /* { dg-final { scan-assembler-times "movabsq" 1 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { 
dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-12b.c b/gcc/testsuite/gcc.target/i386/pr100865-12b.c index 63a5629..dee0cfb 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-12b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-12b.c @@ -5,4 +5,4 @@ /* { dg-final { scan-assembler-times "movabsq" 1 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index 8e8a7ea..80e9fdb 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -5,5 +5,7 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c index 44e74c6..35f2e96 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c @@ -4,6 +4,9 @@ #include "pr100865-6a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c index 0a68820..ad267c4 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c @@ -5,5 +5,8 @@ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8b.c b/gcc/testsuite/gcc.target/i386/pr100865-8b.c index 99a10ad..4b7dd7c 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-8b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-8b.c @@ -4,4 +4,4 @@ #include "pr100865-8a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9b.c b/gcc/testsuite/gcc.target/i386/pr100865-9b.c index 1469624..a315dde 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-9b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-9b.c @@ -4,4 +4,4 @@ #include "pr100865-9a.c" /* { dg-final { scan-assembler-times "vpbroadcastw\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]%xmm\[0-9\]+, " 16 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82941-1.c b/gcc/testsuite/gcc.target/i386/pr82941-1.c index d7e530d..c3be2f5 100644 --- a/gcc/testsuite/gcc.target/i386/pr82941-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82941-1.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82942-1.c b/gcc/testsuite/gcc.target/i386/pr82942-1.c index 9cdf81a..29ead04 100644 --- a/gcc/testsuite/gcc.target/i386/pr82942-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82942-1.c @@ -3,4 +3,5 @@ #include "pr82941-1.c" -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-1.c b/gcc/testsuite/gcc.target/i386/pr82990-1.c index ff1d6d4..bbf580f 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-1.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-1.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-3.c b/gcc/testsuite/gcc.target/i386/pr82990-3.c index 201fa98..89ddb20 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-3.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-3.c @@ -3,4 +3,5 @@ #include "pr82941-1.c" -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82990-5.c b/gcc/testsuite/gcc.target/i386/pr82990-5.c index 9932bdc..b9da0e7 100644 --- a/gcc/testsuite/gcc.target/i386/pr82990-5.c +++ b/gcc/testsuite/gcc.target/i386/pr82990-5.c @@ -11,4 +11,5 @@ pr82941 () z = y; } -/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ -- cgit v1.1 From eaa93a0f3d9f67c8cbc1dc849ea6feba432ff412 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Mon, 22 Feb 2021 19:13:28 -0600 Subject: rs6000: Add support for _mm_minpos_epu16 Add a naive implementation of the subject x86 intrinsic to ease porting. 2021-08-03 Paul A. Clarke gcc * config/rs6000/smmintrin.h (_mm_minpos_epu16): New. --- gcc/config/rs6000/smmintrin.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'gcc') diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h index 0145b92..3767a67 100644 --- a/gcc/config/rs6000/smmintrin.h +++ b/gcc/config/rs6000/smmintrin.h @@ -296,4 +296,31 @@ _mm_floor_ss (__m128 __A, __m128 __B) return __r; } +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +__inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + #endif -- cgit v1.1 From 0f44b097321c42ba8f9301f369a9799424aa6d46 Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Mon, 22 Feb 2021 19:20:48 -0600 Subject: rs6000: Add test for _mm_minpos_epu16 Copy the test for _mm_minpos_epu16 from gcc/testsuite/gcc.target/i386/sse4_1-phminposuw.c, with a few adjustments: - Adjust the dejagnu directives for powerpc platform. - Make the data not be monotonically increasing, such that some of the returned values are not always the first value (index 0). - Create a list of input data testing various scenarios including more than one minimum value and different orders and indices of the minimum value. 
- Fix a masking issue where the index was being truncated to 2 bits instead of 3 bits, which wasn't found because all of the returned indices were 0 with the original generated data. - Support big-endian. 2021-08-03 Paul A. Clarke gcc/testsuite * gcc.target/powerpc/sse4_1-phminposuw.c: Copy from gcc/testsuite/gcc.target/i386, adjust dg directives to suit, make more robust. --- .../gcc.target/powerpc/sse4_1-phminposuw.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c new file mode 100644 index 0000000..146df24 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-phminposuw.c @@ -0,0 +1,68 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mvsx -Wno-psabi" } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ + +#define NO_WARN_X86_INTRINSICS 1 +#ifndef CHECK_H +#define CHECK_H "sse4_1-check.h" +#endif + +#ifndef TEST +#define TEST sse4_1_test +#endif + +#include CHECK_H + +#include + +#define DIM(a) (sizeof (a) / sizeof (a)[0]) + +static void +TEST (void) +{ + union + { + __m128i x; + unsigned short s[8]; + } src[] = + { + { .s = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } }, + { .s = { 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } }, + { .s = { 0xffff, 0xffff, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000, 0xffff } }, + { .s = { 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008 } }, + { .s = { 0x0008, 0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001 } }, + { .s = { 0xfff4, 0xfff3, 0xfff2, 0xfff1, 0xfff3, 0xfff1, 0xfff2, 0xfff3 } } + }; + unsigned short minVal[DIM (src)]; + int minInd[DIM (src)]; + unsigned short minValScalar, minIndScalar; + int i, j; + union + { + int si; + unsigned short s[2]; + } res; + + for (i = 0; i < DIM (src); i++) + { + res.si = _mm_cvtsi128_si32 
(_mm_minpos_epu16 (src[i].x)); + minVal[i] = res.s[0]; + minInd[i] = res.s[1] & 0b111; + } + + for (i = 0; i < DIM (src); i++) + { + minValScalar = src[i].s[0]; + minIndScalar = 0; + + for (j = 1; j < 8; j++) + if (minValScalar > src[i].s[j]) + { + minValScalar = src[i].s[j]; + minIndScalar = j; + } + + if (minValScalar != minVal[i] && minIndScalar != minInd[i]) + abort (); + } +} -- cgit v1.1 From aabf07cd5dc314135adde89830a86be157d7596b Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 3 Aug 2021 13:53:02 -0600 Subject: Disable a test case in ILP32 [PR101688]. Resolves: PR testsuite/101688 - g++.dg/warn/Wstringop-overflow-4.C fails on 32-bit archs with new jump threader gcc/testsuite: PR testsuite/101688 * g++.dg/warn/Wstringop-overflow-4.C: Disable a test case in ILP32. --- gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C b/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C index 121239a..c80977d 100644 --- a/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C +++ b/gcc/testsuite/g++.dg/warn/Wstringop-overflow-4.C @@ -145,7 +145,12 @@ void test_strcpy_new_int16_t (size_t n, const size_t vals[]) T (S (9), new int16_t[r_imin_imax * 2 + 1]); int r_0_imax = SR (0, INT_MAX); - T (S (1), new int16_t[r_0_imax]); + + if (sizeof (int) < sizeof (size_t)) + /* The code below might emit a warning when int is the same size + as size_t as a result of threading. See PR 101688 comment #2. 
*/ + T (S (1), new int16_t[r_0_imax]); + T (S (2), new int16_t[r_0_imax + 1]); T (S (9), new int16_t[r_0_imax * 2 + 1]); -- cgit v1.1 From f9ad3d5339faaaed6e15a7b27d90fbc66eb72f37 Mon Sep 17 00:00:00 2001 From: Eugene Rozenfeld Date: Mon, 2 Aug 2021 17:12:04 -0700 Subject: Fixes for AutoFDO tests * Changed several tests to use -fdump-ipa-afdo-optimized instead of -fdump-ipa-afdo in dg-options so that the expected output can be found * Increased the number of iterations in several tests so that perf can have enough sampling events Contributes to fixing PR gcov-profile/71672. gcc/testsuite/ChangeLog: * g++.dg/tree-prof/indir-call-prof.C: Fix options, increase the number of iterations. * g++.dg/tree-prof/morefunc.C: Fix options, increase the number of iterations. * g++.dg/tree-prof/reorder.C: Fix options, increase the number of iterations. * gcc.dg/tree-prof/indir-call-prof-2.c: Fix options, increase the number of iterations. * gcc.dg/tree-prof/indir-call-prof.c: Fix options. --- gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C | 4 ++-- gcc/testsuite/g++.dg/tree-prof/morefunc.C | 7 ++++--- gcc/testsuite/g++.dg/tree-prof/reorder.C | 6 +++--- gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c | 2 +- gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C b/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C index 3374744..b454171 100644 --- a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C +++ b/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized" } */ struct A { A () {} @@ -26,7 +26,7 @@ main (void) int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) { p = (A *)wrap ((void *)&a); p->AA (); diff --git 
a/gcc/testsuite/g++.dg/tree-prof/morefunc.C b/gcc/testsuite/g++.dg/tree-prof/morefunc.C index 621d09a..96e0073 100644 --- a/gcc/testsuite/g++.dg/tree-prof/morefunc.C +++ b/gcc/testsuite/g++.dg/tree-prof/morefunc.C @@ -1,4 +1,5 @@ -/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo -Wno-attributes -Wno-coverage-mismatch -Wno-missing-profile" } */ +/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized -Wno-attributes -Wno-coverage-mismatch -Wno-missing-profile" } */ + #include "reorder_class1.h" #include "reorder_class2.h" @@ -19,7 +20,7 @@ static __attribute__((always_inline)) void test1 (A *tc) { int i; - for (i = 0; i < 1000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); if (g<100) g++; } @@ -28,7 +29,7 @@ static __attribute__((always_inline)) void test2 (B *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); } diff --git a/gcc/testsuite/g++.dg/tree-prof/reorder.C b/gcc/testsuite/g++.dg/tree-prof/reorder.C index 000fb65..5049096 100644 --- a/gcc/testsuite/g++.dg/tree-prof/reorder.C +++ b/gcc/testsuite/g++.dg/tree-prof/reorder.C @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo -Wno-coverage-mismatch -Wno-attributes" } */ +/* { dg-options "-O2 -fno-devirtualize --param=profile-func-internal-id=0 -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized -Wno-coverage-mismatch -Wno-attributes" } */ #ifdef _PROFILE_USE #include "reorder_class1.h" @@ -13,7 +13,7 @@ static __attribute__((always_inline)) void test1 (A *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); if (g<100) g++; } @@ -22,7 +22,7 @@ static __attribute__((always_inline)) void test2 (B *tc) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 10000000; i++) g += tc->foo(); } diff 
--git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c index bbba052..2585326 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c @@ -22,7 +22,7 @@ int main (void) { int i, val = 0; - for (i = 0; i < 100000; i++) + for (i = 0; i < 10000000; i++) { val = do_op (val, add1); val = do_op (val, sub1); diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c index 138b85a..7020452 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fdump-tree-optimized -fdump-ipa-profile-optimized -fdump-ipa-afdo-optimized" } */ static int a1 (void) { -- cgit v1.1 From 0ed093c7c3f755bc1cd80e5186abeb2f5c50ee0c Mon Sep 17 00:00:00 2001 From: Eugene Rozenfeld Date: Mon, 2 Aug 2021 17:22:34 -0700 Subject: Fix indir-call-prof-2.c with AutoFDO indir-call-prof-2.c has -fno-early-inlining but AutoFDO can't work without early inlining (it needs to match the inlining of the profiled binary). I changed profopt.exp to always pass -fearly-inlining for AutoFDO. With that change the indirect call inlining in indir-call-prof-2.c happens in the early inliner so I changed the dg-final-use-autofdo. Contributes to fixing PR gcov-profile/71672 gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/indir-call-prof-2.c: Fix dg-final-use-autofdo. * lib/profopt.exp: Pass -fearly-inlining when compiling with AutoFDO. 
--- gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c | 6 +++--- gcc/testsuite/lib/profopt.exp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c index 2585326..594c3f3 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fno-early-inlining -fdump-ipa-profile-optimized -fdump-ipa-afdo" } */ +/* { dg-options "-O2 -fno-early-inlining -fdump-ipa-profile-optimized -fdump-tree-einline-optimized" } */ volatile int one; static int add1 (int val) @@ -31,5 +31,5 @@ main (void) } /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "profile"} } */ /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "profile"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "afdo"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "afdo"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining add1/1 into main/4." "einline"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining sub1/2 into main/4." 
"einline"} } */ diff --git a/gcc/testsuite/lib/profopt.exp b/gcc/testsuite/lib/profopt.exp index 9997eb3..030d25e 100644 --- a/gcc/testsuite/lib/profopt.exp +++ b/gcc/testsuite/lib/profopt.exp @@ -290,7 +290,7 @@ proc auto-profopt-execute { src } { } set profile_wrapper [profopt-perf-wrapper] set profile_option "-g -DFOR_AUTOFDO_TESTING" - set feedback_option "-fauto-profile -DFOR_AUTOFDO_TESTING" + set feedback_option "-fauto-profile -DFOR_AUTOFDO_TESTING -fearly-inlining" set run_autofdo 1 profopt-execute $src unset profile_wrapper -- cgit v1.1 From 9265b378531391498ec1727f67a45da72a6c07e9 Mon Sep 17 00:00:00 2001 From: Eugene Rozenfeld Date: Mon, 2 Aug 2021 18:29:24 -0700 Subject: Fixes for AutoFDO testing * create_gcov tool doesn't currently support dwarf 5 so I made a change in profopt.exp to pass -gdwarf-4 when compiling the binary to profile. * I updated the invocation of create_gcov in profopt.exp to pass -gcov_version=2. I recently made a change to create_gcov to support version 2: https://github.com/google/autofdo/pull/117 . * I removed useless -o perf.data from the invocation of gcc-auto-profile in target-supports.exp. These changes contribute to fixing PR gcov-profile/71672. gcc/testsuite/ChangeLog: * lib/profopt.exp: Pass gdwarf-4 when compiling test to profile; pass -gcov_version=2. * lib/target-supports.exp: Remove unnecessary -o perf.data passed to gcc-auto-profile. 
--- gcc/testsuite/lib/profopt.exp | 4 ++-- gcc/testsuite/lib/target-supports.exp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/lib/profopt.exp b/gcc/testsuite/lib/profopt.exp index 030d25e..25f45ec 100644 --- a/gcc/testsuite/lib/profopt.exp +++ b/gcc/testsuite/lib/profopt.exp @@ -289,7 +289,7 @@ proc auto-profopt-execute { src } { return } set profile_wrapper [profopt-perf-wrapper] - set profile_option "-g -DFOR_AUTOFDO_TESTING" + set profile_option "-gdwarf-4 -DFOR_AUTOFDO_TESTING" set feedback_option "-fauto-profile -DFOR_AUTOFDO_TESTING -fearly-inlining" set run_autofdo 1 profopt-execute $src @@ -451,7 +451,7 @@ proc profopt-execute { src } { # convert profile if { $run_autofdo == 1 } { set bprefix "afdo." - set cmd "create_gcov --binary $execname1 --profile=$tmpdir/$base.perf.data -gcov_version=1 --gcov=$tmpdir/$bprefix$base.$ext" + set cmd "create_gcov --binary $execname1 --profile=$tmpdir/$base.perf.data -gcov_version=2 --gcov=$tmpdir/$bprefix$base.$ext" verbose "Running $cmd" set id [remote_spawn "" $cmd] if { $id < 0 } { diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 42ac9d0..44465b1 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -626,7 +626,7 @@ proc check_effective_target_keeps_null_pointer_checks { } { # this allows parallelism of 16 and higher of parallel gcc-auto-profile proc profopt-perf-wrapper { } { global srcdir - return "$srcdir/../config/i386/gcc-auto-profile -o perf.data -m8 " + return "$srcdir/../config/i386/gcc-auto-profile -m8 " } # Return true if profiling is supported on the target. -- cgit v1.1 From 285aa6895d479bed8e72ad363290846645b6faa0 Mon Sep 17 00:00:00 2001 From: Eugene Rozenfeld Date: Mon, 2 Aug 2021 18:36:09 -0700 Subject: Fix indirect call inlining with AutoFDO The histogram value for indirect calls was incorrectly set up. That is fixed now. 
With this change the tree-prof tests checking indirect call inlining with AutoFDO in gcc.dg and g++.dg are passing. Resolves: PR gcov-profile/71672 - inlining indirect calls does not work with autofdo gcc/ChangeLog: PR gcov-profile/71672 * auto-profile.c (afdo_indirect_call): Fix setup of the histogram value for indirect calls. --- gcc/auto-profile.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'gcc') diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c index b23b82b..4c1fc6b 100644 --- a/gcc/auto-profile.c +++ b/gcc/auto-profile.c @@ -1009,13 +1009,18 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, const icall_target_map &map, histogram_value hist = gimple_alloc_histogram_value ( cfun, HIST_TYPE_INDIR_CALL, stmt, callee); - hist->n_counters = 3; + hist->n_counters = 4; hist->hvalue.counters = XNEWVEC (gcov_type, hist->n_counters); gimple_add_histogram_value (cfun, stmt, hist); - hist->hvalue.counters[0] = direct_call->profile_id; - hist->hvalue.counters[1] = max_iter->second; - hist->hvalue.counters[2] = total; + // Total counter + hist->hvalue.counters[0] = total; + // Number of value/counter pairs + hist->hvalue.counters[1] = 1; + // Value + hist->hvalue.counters[2] = direct_call->profile_id; + // Counter + hist->hvalue.counters[3] = max_iter->second; if (!transform) return; -- cgit v1.1 From ebff536cf401f94129a50e50b69beeb09080a68a Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Tue, 3 Aug 2021 22:22:37 +0000 Subject: rs6000: "e" is not a free constraint letter It is the prefix of the "es" and "eI" constraints. 2021-08-03 Segher Boessenkool * config/rs6000/constraints.md: Remove "e" from the list of available constraint characters. 
--- gcc/config/rs6000/constraints.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md index 561ce97..c8cff1a 100644 --- a/gcc/config/rs6000/constraints.md +++ b/gcc/config/rs6000/constraints.md @@ -17,7 +17,7 @@ ;; along with GCC; see the file COPYING3. If not see ;; . -;; Available constraint letters: e k q t u A B C D S T +;; Available constraint letters: k q t u A B C D S T ;; Register constraints -- cgit v1.1 From 3a7794b469f897e0141817785738e2faa73119b5 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Fri, 4 Jun 2021 19:10:38 +0000 Subject: rs6000: Replace & by && 2021-08-03 Segher Boessenkool * config/rs6000/vsx.md (*vsx_le_perm_store_): Use && instead of &. --- gcc/config/rs6000/vsx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 6f6fc0b..441735d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -1014,7 +1014,7 @@ [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q") (match_operand:VSX_LE_128 1 "vsx_register_operand" "+wa,r"))] "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR - & !altivec_indexed_or_indirect_operand (operands[0], mode)" + && !altivec_indexed_or_indirect_operand (operands[0], mode)" "@ # #" -- cgit v1.1 From cbbd439a33e889a1a47b103951c53472fc8558eb Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Mon, 2 Aug 2021 16:27:02 -0700 Subject: compiler: check slice to pointer-to-array conversion element type When checking a slice to pointer-to-array conversion, I forgot to verify that the elements types are identical. 
For golang/go#395 Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/339329 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/expressions.cc | 5 ++++- gcc/go/gofrontend/types.cc | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'gcc') diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 95b9340..801e039 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -0a4d612e6b211780b294717503fc739bbd1f509c +54361805bd611d896042b879ee7f6d2d4d088537 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 15c9eab..51a8b7e 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -3962,7 +3962,10 @@ Type_conversion_expression::do_lower(Gogo*, Named_object*, if (type->points_to() != NULL && type->points_to()->array_type() != NULL && !type->points_to()->is_slice_type() - && val->type()->is_slice_type()) + && val->type()->is_slice_type() + && Type::are_identical(type->points_to()->array_type()->element_type(), + val->type()->array_type()->element_type(), + 0, NULL)) { Temporary_statement* val_temp = NULL; if (!val->is_multi_eval_safe()) diff --git a/gcc/go/gofrontend/types.cc b/gcc/go/gofrontend/types.cc index 7c7b2eb..0c44186 100644 --- a/gcc/go/gofrontend/types.cc +++ b/gcc/go/gofrontend/types.cc @@ -846,7 +846,9 @@ Type::are_convertible(const Type* lhs, const Type* rhs, std::string* reason) if (rhs->is_slice_type() && lhs->points_to() != NULL && lhs->points_to()->array_type() != NULL - && !lhs->points_to()->is_slice_type()) + && !lhs->points_to()->is_slice_type() + && Type::are_identical(lhs->points_to()->array_type()->element_type(), + rhs->array_type()->element_type(), 0, reason)) return true; // An unsafe.Pointer type may be converted to any pointer type or to -- cgit v1.1 From e435e72ad713cadd661072427588ec1c777c04e3 Mon Sep 
17 00:00:00 2001 From: Ian Lance Taylor Date: Tue, 3 Aug 2021 11:36:24 -0700 Subject: compile, runtime: make selectnbrecv return two values The only difference between selectnbrecv and selectnbrecv2 is that the latter sets the input pointer value by the second return value from chanrecv. So by making selectnbrecv return two values from chanrecv, we can get rid of selectnbrecv2; the compiler can now call only selectnbrecv and generate simpler code. This is the gofrontend version of https://golang.org/cl/292890. Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/339529 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/runtime.def | 8 ++---- gcc/go/gofrontend/statements.cc | 59 ++++++++++++++++++++++------------------- 3 files changed, 34 insertions(+), 35 deletions(-) (limited to 'gcc') diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 801e039..5a097ff 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -54361805bd611d896042b879ee7f6d2d4d088537 +2031f0be9c0b5fda6421d290a0261eb6bd1c8205 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/runtime.def b/gcc/go/gofrontend/runtime.def index fad8ceb..87a2708 100644 --- a/gcc/go/gofrontend/runtime.def +++ b/gcc/go/gofrontend/runtime.def @@ -204,12 +204,8 @@ DEF_GO_RUNTIME(SELECTNBSEND, "runtime.selectnbsend", P2(CHAN, POINTER), R1(BOOL) // Non-blocking receive a value from a channel, used for two-case select // statement with a default case. -DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN), R1(BOOL)) - -// Non-blocking tuple receive from a channel, used for two-case select -// statement with a default case. -DEF_GO_RUNTIME(SELECTNBRECV2, "runtime.selectnbrecv2", P3(POINTER, POINTER, CHAN), - R1(BOOL)) +DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN), + R2(BOOL, BOOL)) // Block execution. Used for zero-case select.
DEF_GO_RUNTIME(BLOCK, "runtime.block", P0(), R0()) diff --git a/gcc/go/gofrontend/statements.cc b/gcc/go/gofrontend/statements.cc index 9643d1b..95fa3c4 100644 --- a/gcc/go/gofrontend/statements.cc +++ b/gcc/go/gofrontend/statements.cc @@ -6051,7 +6051,7 @@ Select_statement::lower_two_case(Block* b) Expression* chanref = Expression::make_temporary_reference(chantmp, loc); Block* bchan; - Expression* call; + Expression* cond; if (chancase.is_send()) { // if selectnbsend(chan, &val) { body } else { default body } @@ -6065,7 +6065,7 @@ Select_statement::lower_two_case(Block* b) Expression* ref = Expression::make_temporary_reference(ts, loc); Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc); - call = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr); + cond = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr); bchan = chancase.statements(); } else @@ -6075,34 +6075,31 @@ Select_statement::lower_two_case(Block* b) Expression* ref = Expression::make_temporary_reference(ts, loc); Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc); - Expression* okref = NULL; - if (chancase.closed() == NULL && chancase.closedvar() == NULL) - { - // Simple receive. - // if selectnbrecv(&lhs, chan) { body } else { default body } - call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2, addr, chanref); - } - else - { - // Tuple receive. 
- // if selectnbrecv2(&lhs, &ok, chan) { body } else { default body } - - Type* booltype = Type::make_boolean_type(); - Temporary_statement* okts = Statement::make_temporary(booltype, NULL, - loc); - b->add_statement(okts); - - okref = Expression::make_temporary_reference(okts, loc); - Expression* okaddr = Expression::make_unary(OPERATOR_AND, okref, loc); - call = Runtime::make_call(Runtime::SELECTNBRECV2, loc, 3, addr, okaddr, - chanref); - } + + // selected, ok = selectnbrecv(&lhs, chan) + Call_expression* call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2, + addr, chanref); + + Temporary_statement* selected_temp = + Statement::make_temporary(Type::make_boolean_type(), + Expression::make_call_result(call, 0), + loc); + b->add_statement(selected_temp); + + Temporary_statement* ok_temp = + Statement::make_temporary(Type::make_boolean_type(), + Expression::make_call_result(call, 1), + loc); + b->add_statement(ok_temp); + + cond = Expression::make_temporary_reference(selected_temp, loc); Location cloc = chancase.location(); bchan = new Block(b, loc); if (chancase.val() != NULL && !chancase.val()->is_sink_expression()) { - Statement* as = Statement::make_assignment(chancase.val(), ref->copy(), + Statement* as = Statement::make_assignment(chancase.val(), + ref->copy(), cloc); bchan->add_statement(as); } @@ -6114,12 +6111,18 @@ Select_statement::lower_two_case(Block* b) if (chancase.closed() != NULL && !chancase.closed()->is_sink_expression()) { + Expression* okref = Expression::make_temporary_reference(ok_temp, + cloc); Statement* as = Statement::make_assignment(chancase.closed(), - okref->copy(), cloc); + okref, cloc); bchan->add_statement(as); } else if (chancase.closedvar() != NULL) - chancase.closedvar()->var_value()->set_init(okref->copy()); + { + Expression* okref = Expression::make_temporary_reference(ok_temp, + cloc); + chancase.closedvar()->var_value()->set_init(okref); + } Statement* bs = Statement::make_block_statement(chancase.statements(), cloc); @@ 
-6127,7 +6130,7 @@ Select_statement::lower_two_case(Block* b) } Statement* ifs = - Statement::make_if_statement(call, bchan, defcase.statements(), loc); + Statement::make_if_statement(cond, bchan, defcase.statements(), loc); b->add_statement(ifs); Statement* label = -- cgit v1.1 From fa1407c7613214cb4a45734fdb14c4756a83808a Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Wed, 4 Aug 2021 00:16:51 +0000 Subject: Daily bump. --- gcc/ChangeLog | 101 ++++++++++++++++++++++++++++++++++++++++++++++++ gcc/DATESTAMP | 2 +- gcc/analyzer/ChangeLog | 6 +++ gcc/testsuite/ChangeLog | 62 +++++++++++++++++++++++++++++ 4 files changed, 170 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 87b0e27..04757ba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,104 @@ +2021-08-03 Segher Boessenkool + + * config/rs6000/vsx.md (*vsx_le_perm_store_): Use && instead of &. + +2021-08-03 Segher Boessenkool + + * config/rs6000/constraints.md: Remove "e" from the list of available + constraint characters. + +2021-08-03 Eugene Rozenfeld + + PR gcov-profile/71672 + * auto-profile.c (afdo_indirect_call): Fix setup of the histogram value for indirect calls. + +2021-08-03 Paul A. Clarke + + * config/rs6000/smmintrin.h (_mm_minpos_epu16): New. + +2021-08-03 H.J. Lu + + * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode, + try XMM31 to avoid vzeroupper. + +2021-08-03 Richard Sandiford + + * doc/invoke.texi: Document -mtune=neoverse-512tvb and + -mcpu=neoverse-512tvb. + * config/aarch64/aarch64-cores.def (neoverse-512tvb): New entry. + * config/aarch64/aarch64-tune.md: Regenerate. + * config/aarch64/aarch64.c (neoverse512tvb_sve_vector_cost) + (neoverse512tvb_sve_issue_info, neoverse512tvb_vec_issue_info) + (neoverse512tvb_vector_cost, neoverse512tvb_tunings): New structures. + (aarch64_adjust_body_cost_sve): Handle -mtune=neoverse-512tvb. + (aarch64_adjust_body_cost): Likewise.
+ +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Only + record issue information for operations that occur in the + innermost loop. + +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64.c (aarch64_multiply_add_p): Add a vec_flags + parameter. Detect cases in which an Advanced SIMD MLA would almost + certainly require a MOV. + (aarch64_count_ops): Update accordingly. + +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction): New + function, split out from... + (aarch64_detect_vector_stmt_subtype): ...here. + (aarch64_add_stmt_cost): Treat extracting element 0 as free. + +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64-protos.h (sve_vec_cost): + Add gather_load_x32_cost and gather_load_x64_cost. + * config/aarch64/aarch64.c (generic_sve_vector_cost) + (a64fx_sve_vector_cost, neoversev1_sve_vector_cost): Update + accordingly, using the values given by the scalar_load * number + of elements calculation that we used previously. + (aarch64_detect_vector_stmt_subtype): Use the new fields. + +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New + function, split out from... + (aarch64_adjust_body_cost): ...here. + +2021-08-03 Richard Sandiford + + * config/aarch64/fractional-cost.h: New file. + * config/aarch64/aarch64.c: Include (indirectly) + and cost_fraction.h. + (vec_cost_fraction): New typedef. + (aarch64_detect_scalar_stmt_subtype): Use it for statement costs. + (aarch64_detect_vector_stmt_subtype): Likewise. + (aarch64_sve_adjust_stmt_cost, aarch64_adjust_stmt_cost): Likewise. + (aarch64_estimate_min_cycles_per_iter): Use vec_cost_fraction + for cycle counts. + (aarch64_adjust_body_cost): Likewise. + (aarch64_test_cost_fraction): New function. + (aarch64_run_selftests): Call it. + +2021-08-03 Richard Sandiford + + * config/aarch64/aarch64-protos.h (tune_params::sve_width): Turn + into a bitmask. 
+ * config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Update + accordingly. + (aarch64_estimated_poly_value): Likewise. Use the least significant + set bit for the minimum and likely values. Use the most significant + set bit for the maximum value. + +2021-08-03 liuhongt + + * config/i386/sse.md (cond_): New expander. + (cond_mul): Ditto. + 2021-08-03 Kewen Lin * tree-cfg.c (move_sese_region_to_fn): Fix typos on dloop. diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index d34783f..856144c 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20210803 +20210804 diff --git a/gcc/analyzer/ChangeLog b/gcc/analyzer/ChangeLog index 2da5aae5..4579796 100644 --- a/gcc/analyzer/ChangeLog +++ b/gcc/analyzer/ChangeLog @@ -1,3 +1,9 @@ +2021-08-03 Jakub Jelinek + + PR analyzer/101721 + * sm-malloc.cc (known_allocator_p): Only check DECL_FUNCTION_CODE on + BUILT_IN_NORMAL builtins. + 2021-07-29 Ankur Saini * call-string.cc (call_string::element_t::operator==): New operator. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3806434..097a5b5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,65 @@ +2021-08-03 Eugene Rozenfeld + + * lib/profopt.exp: Pass gdwarf-4 when compiling test to profile; pass -gcov_version=2. + * lib/target-supports.exp: Remove unnecessary -o perf.data passed to gcc-auto-profile. + +2021-08-03 Eugene Rozenfeld + + * gcc.dg/tree-prof/indir-call-prof-2.c: Fix dg-final-use-autofdo. + * lib/profopt.exp: Pass -fearly-inlining when compiling with AutoFDO. + +2021-08-03 Eugene Rozenfeld + + * g++.dg/tree-prof/indir-call-prof.C: Fix options, increase the number of iterations. + * g++.dg/tree-prof/morefunc.C: Fix options, increase the number of iterations. + * g++.dg/tree-prof/reorder.C: Fix options, increase the number of iterations. + * gcc.dg/tree-prof/indir-call-prof-2.c: Fix options, increase the number of iterations. + * gcc.dg/tree-prof/indir-call-prof.c: Fix options. 
+ +2021-08-03 Martin Sebor + + PR testsuite/101688 + * g++.dg/warn/Wstringop-overflow-4.C: Disable a test case in ILP32. + +2021-08-03 Paul A. Clarke + + * gcc.target/powerpc/sse4_1-phminposuw.c: Copy from + gcc/testsuite/gcc.target/i386, adjust dg directives to suit, + make more robust. + +2021-08-03 H.J. Lu + + * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to + disable XMM31. + * gcc.target/i386/avx-vzeroupper-15.c: Likewise. + * gcc.target/i386/pr82941-1.c: Updated. Check for vzeroupper. + * gcc.target/i386/pr82942-1.c: Likewise. + * gcc.target/i386/pr82990-1.c: Likewise. + * gcc.target/i386/pr82990-3.c: Likewise. + * gcc.target/i386/pr82990-5.c: Likewise. + * gcc.target/i386/pr100865-4b.c: Likewise. + * gcc.target/i386/pr100865-6b.c: Likewise. + * gcc.target/i386/pr100865-7b.c: Likewise. + * gcc.target/i386/pr100865-10b.c: Likewise. + * gcc.target/i386/pr100865-8b.c: Updated. + * gcc.target/i386/pr100865-9b.c: Likewise. + * gcc.target/i386/pr100865-11b.c: Likewise. + * gcc.target/i386/pr100865-12b.c: Likewise. + +2021-08-03 liuhongt + + * gcc.target/i386/cond_op_addsubmul_d-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_d-2.c: New test. + * gcc.target/i386/cond_op_addsubmul_q-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_q-2.c: New test. + * gcc.target/i386/cond_op_addsubmul_w-1.c: New test. + * gcc.target/i386/cond_op_addsubmul_w-2.c: New test. + +2021-08-03 Jakub Jelinek + + PR analyzer/101721 + * gcc.dg/analyzer/pr101721.c: New test. + 2021-08-03 H.J. Lu PR target/80566 -- cgit v1.1 From 22e40cc7feb8abda85762e4f07719836d5c57f1a Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Tue, 3 Aug 2021 19:35:55 -0400 Subject: compiler: support new language constructs in escape analysis Previous CLs add new language constructs in Go 1.17, specifically, unsafe.Add, unsafe.Slice, and conversion from a slice to a pointer to an array. This CL handles them in the escape analysis. 
At the point of the escape analysis, unsafe.Add and unsafe.Slice are still builtin calls, so just handle them in data flow. Conversion from a slice to a pointer to an array has already been lowered to a combination of compound expression, conditional expression and slice info expressions, so handle them in the escape analysis. Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/339671 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/escape.cc | 63 +++++++++++++++++++++++++++++---- gcc/go/gofrontend/expressions.cc | 44 +---------------------- gcc/go/gofrontend/expressions.h | 75 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 50 deletions(-) (limited to 'gcc') diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 5a097ff..be1a90f 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -2031f0be9c0b5fda6421d290a0261eb6bd1c8205 +616ee658a6238e7de53592ebda5997f6de6a00de The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/escape.cc b/gcc/go/gofrontend/escape.cc index cf68874..347ac25 100644 --- a/gcc/go/gofrontend/escape.cc +++ b/gcc/go/gofrontend/escape.cc @@ -2325,19 +2325,55 @@ Escape_analysis_assign::assign(Node* dst, Node* src) } break; + case Expression::EXPRESSION_SLICE_INFO: + { + Slice_info_expression* sie = e->slice_info_expression(); + if (sie->info() == Expression::SLICE_INFO_VALUE_POINTER) + { + Node* slice = Node::make_node(sie->slice()); + this->assign(dst, slice); + } + } + break; + case Expression::EXPRESSION_CALL: { Call_expression* call = e->call_expression(); if (call->is_builtin()) { Builtin_call_expression* bce = call->builtin_call_expression(); - if (bce->code() == Builtin_call_expression::BUILTIN_APPEND) + switch (bce->code()) { - // Append returns the first argument. - // The subsequent arguments are already leaked because - // they are operands to append. 
- Node* appendee = Node::make_node(call->args()->front()); - this->assign(dst, appendee); + case Builtin_call_expression::BUILTIN_APPEND: + { + // Append returns the first argument. + // The subsequent arguments are already leaked because + // they are operands to append. + Node* appendee = Node::make_node(call->args()->front()); + this->assign(dst, appendee); + } + break; + + case Builtin_call_expression::BUILTIN_ADD: + { + // unsafe.Add(p, off). + // Flow p to result. + Node* arg = Node::make_node(call->args()->front()); + this->assign(dst, arg); + } + break; + + case Builtin_call_expression::BUILTIN_SLICE: + { + // unsafe.Slice(p, len). + // The resulting slice has the same backing store as p. Flow p to result. + Node* arg = Node::make_node(call->args()->front()); + this->assign(dst, arg); + } + break; + + default: + break; } break; } @@ -2592,6 +2628,21 @@ Escape_analysis_assign::assign(Node* dst, Node* src) } break; + case Expression::EXPRESSION_CONDITIONAL: + { + Conditional_expression* ce = e->conditional_expression(); + this->assign(dst, Node::make_node(ce->then_expr())); + this->assign(dst, Node::make_node(ce->else_expr())); + } + break; + + case Expression::EXPRESSION_COMPOUND: + { + Compound_expression* ce = e->compound_expression(); + this->assign(dst, Node::make_node(ce->expr())); + } + break; + default: // TODO(cmang): Add debug info here; this should not be reachable. // For now, just to be conservative, we'll just say dst flows to src. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 51a8b7e..3e433d6 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -18009,49 +18009,7 @@ Expression::make_type_info(Type* type, Type_info type_info) return new Type_info_expression(type, type_info); } -// An expression that evaluates to some characteristic of a slice. -// This is used when indexing, bound-checking, or nil checking a slice. 
- -class Slice_info_expression : public Expression -{ - public: - Slice_info_expression(Expression* slice, Slice_info slice_info, - Location location) - : Expression(EXPRESSION_SLICE_INFO, location), - slice_(slice), slice_info_(slice_info) - { } - - protected: - Type* - do_type(); - - void - do_determine_type(const Type_context*) - { } - - Expression* - do_copy() - { - return new Slice_info_expression(this->slice_->copy(), this->slice_info_, - this->location()); - } - - Bexpression* - do_get_backend(Translate_context* context); - - void - do_dump_expression(Ast_dump_context*) const; - - void - do_issue_nil_check() - { this->slice_->issue_nil_check(); } - - private: - // The slice for which we are getting information. - Expression* slice_; - // What information we want. - Slice_info slice_info_; -}; +// Slice_info_expression. // Return the type of the slice info. diff --git a/gcc/go/gofrontend/expressions.h b/gcc/go/gofrontend/expressions.h index 492849b..79a8785 100644 --- a/gcc/go/gofrontend/expressions.h +++ b/gcc/go/gofrontend/expressions.h @@ -62,6 +62,7 @@ class Type_guard_expression; class Heap_expression; class Receive_expression; class Slice_value_expression; +class Slice_info_expression; class Conditional_expression; class Compound_expression; class Numeric_constant; @@ -900,6 +901,14 @@ class Expression compound_expression() { return this->convert(); } + // If this is a slice info expression, return the + // Slice_info_expression structure. Otherwise, return NULL. + Slice_info_expression* + slice_info_expression() + { + return this->convert(); + } + // Return true if this is a composite literal. bool is_composite_literal() const; @@ -4262,6 +4271,60 @@ class Slice_value_expression : public Expression Expression* cap_; }; +// An expression that evaluates to some characteristic of a slice. +// This is used when indexing, bound-checking, or nil checking a slice. 
+ +class Slice_info_expression : public Expression +{ + public: + Slice_info_expression(Expression* slice, Slice_info slice_info, + Location location) + : Expression(EXPRESSION_SLICE_INFO, location), + slice_(slice), slice_info_(slice_info) + { } + + // The slice operand of this slice info expression. + Expression* + slice() const + { return this->slice_; } + + // The info this expression is about. + Slice_info + info() const + { return this->slice_info_; } + + protected: + Type* + do_type(); + + void + do_determine_type(const Type_context*) + { } + + Expression* + do_copy() + { + return new Slice_info_expression(this->slice_->copy(), this->slice_info_, + this->location()); + } + + Bexpression* + do_get_backend(Translate_context* context); + + void + do_dump_expression(Ast_dump_context*) const; + + void + do_issue_nil_check() + { this->slice_->issue_nil_check(); } + + private: + // The slice for which we are getting information. + Expression* slice_; + // What information we want. + Slice_info slice_info_; +}; + // Conditional expressions. class Conditional_expression : public Expression @@ -4277,6 +4340,14 @@ class Conditional_expression : public Expression condition() const { return this->cond_; } + Expression* + then_expr() const + { return this->then_; } + + Expression* + else_expr() const + { return this->else_; } + protected: int do_traverse(Traverse*); @@ -4322,6 +4393,10 @@ class Compound_expression : public Expression init() const { return this->init_; } + Expression* + expr() const + { return this->expr_; } + protected: int do_traverse(Traverse*); -- cgit v1.1 From 2fc2e3917f9c8fd94f5d101477971d16c483ef88 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 11:41:37 +0800 Subject: Support cond_{fma,fms,fnma,fnms} for vector float/double under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_fma): New expander. (cond_fms): Ditto. (cond_fnma): Ditto. (cond_fnms): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_fma_double-1.c: New test. 
* gcc.target/i386/cond_op_fma_double-2.c: New test. * gcc.target/i386/cond_op_fma_float-1.c: New test. * gcc.target/i386/cond_op_fma_float-2.c: New test. --- gcc/config/i386/sse.md | 96 ++++++++++ .../gcc.target/i386/cond_op_fma_double-1.c | 87 +++++++++ .../gcc.target/i386/cond_op_fma_double-2.c | 206 +++++++++++++++++++++ .../gcc.target/i386/cond_op_fma_float-1.c | 20 ++ .../gcc.target/i386/cond_op_fma_float-2.c | 4 + 5 files changed, 413 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c (limited to 'gcc') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 52b2b42..f5968e0 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4438,6 +4438,29 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fma" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fma4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fmadd__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4515,6 +4538,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fms" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand") + (match_operand:VF_AVX512VL 3 
"vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fms4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fmsub__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4594,6 +4641,30 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fnma" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (match_operand:VF_AVX512VL 4 "vector_operand")) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fnma4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fnmadd__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL @@ -4675,6 +4746,31 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) +(define_expand "cond_fnms" + [(set (match_operand:VF_AVX512VL 0 "register_operand") + (vec_merge:VF_AVX512VL + (fma:VF_AVX512VL + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 2 "vector_operand")) + (match_operand:VF_AVX512VL 3 "vector_operand") + (neg:VF_AVX512VL + (match_operand:VF_AVX512VL 4 "vector_operand"))) + (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_fnms4 (tmp, + operands[2], + operands[3], + operands[4])); + emit_move_insn 
(operands[0], gen_rtx_VEC_MERGE (mode, + tmp, + operands[5], + operands[1])); + DONE; +}) + (define_insn "_fnmsub__mask" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VF_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c new file mode 100644 index 0000000..4e14b75 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c @@ -0,0 +1,87 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times ".COND_FMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FMS" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMS" 3 "optimized" } } */ +/* { dg-final { scan-assembler-times "vfmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub231pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { 
scan-assembler-times "vfnmadd132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132pd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE double +#endif +#ifndef __BUILTIN_FMA +#define __BUILTIN_FMA __builtin_fma +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) < (Y) ? (Y) : (X)) + +#define FMA3(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo3_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + { \ + TYPE tmp = MAX(d[i], e[i]); \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 tmp); \ + else \ + a[i] = tmp; \ + } \ + } + +#define FMAZ(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + fooz_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + a[i] = .0; \ + } + +#define FMA1(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo1_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + a[i] = d[i]; \ + } + + +FMAZ (fma,, +); +FMAZ (fms,, -); +FMAZ (fnma, -, +); +FMAZ (fnms, -, -); + +FMA1 (fma,, +); +FMA1 (fms,, -); +FMA1 (fnma, -, +); +FMA1 (fnms, -, -); + +FMA3 (fma,, +); +FMA3 (fms,, -); +FMA3 (fnma, -, +); +FMA3 (fnms, -, -); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c new file mode 100644 index 0000000..d8180de --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c @@ -0,0 +1,206 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl 
-mprefer-vector-width=256" } */ +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_fma_double-1.c" +#define FMA3_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo3_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + { \ + TYPE tmp = MAX(d[i], e[i]); \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 tmp); \ + else \ + j[i] = tmp; \ + } \ + } + +#define FMAZ_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + fooz_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + j[i] = .0; \ + } + +#define FMA1_O2(OPNAME, OP1, OP2) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo1_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = __BUILTIN_FMA (OP1 d[i], e[i], OP2 a[i]); \ + else \ + j[i] = d[i]; \ + } + +FMAZ_O2 (fma,, +); +FMAZ_O2 (fms,, -); +FMAZ_O2 (fnma, -, +); +FMAZ_O2 (fnms, -, -); + +FMA1_O2 (fma,, +); +FMA1_O2 (fms,, -); +FMA1_O2 (fnma, -, +); +FMA1_O2 (fnms, -, -); + +FMA3_O2 (fma,, +); +FMA3_O2 (fms,, -); +FMA3_O2 (fnma, -, +); +FMA3_O2 (fnms, -, -); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo1_o2_fma (); + /* foo1_fma need to be after foo1_o2_fma since + it changes a[i] which is used by foo1_o2_fma. 
*/ + foo1_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo1_o2_fms (); + foo1_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo1_o2_fnma (); + foo1_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo1_o2_fnms (); + foo1_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fma (); + fooz_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + fooz_o2_fms (); + fooz_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fnma (); + fooz_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + fooz_o2_fnms (); + fooz_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fma (); + foo3_fma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo3_o2_fms (); + foo3_fms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fnma (); + foo3_fnma (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo3_o2_fnms (); + foo3_fnms (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c new file mode 100644 index 0000000..a5752e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=float -fdump-tree-optimized 
-D__BUILTIN_FMA=__builtin_fmaf" } */ +/* { dg-final { scan-tree-dump-times ".COND_FMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMA" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FMS" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times ".COND_FNMS" 3 "optimized" } } */ +/* { dg-final { scan-assembler-times "vfmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub231ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmadd132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vfnmsub132ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#include "cond_op_fma_double-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c new file mode 100644 index 
0000000..0097735 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c @@ -0,0 +1,4 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float -D__BUILTIN_FMA=__builtin_fmaf" } */ + +#include "cond_op_fma_double-2.c" -- cgit v1.1 From 3ae1468e260bf1f8e8c8637133263010213b6ac9 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 13:20:56 +0800 Subject: Add dg-require-effective-target for testcases. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_addsubmul_d-2.c: Add dg-require-effective-target for avx512. * gcc.target/i386/cond_op_addsubmul_q-2.c: Ditto. * gcc.target/i386/cond_op_addsubmul_w-2.c: Ditto. * gcc.target/i386/cond_op_addsubmuldiv_double-2.c: Ditto. * gcc.target/i386/cond_op_addsubmuldiv_float-2.c: Ditto. * gcc.target/i386/cond_op_fma_double-2.c: Ditto. * gcc.target/i386/cond_op_fma_float-2.c: Ditto. --- gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c | 2 ++ gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c | 2 ++ gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c | 2 ++ gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c | 1 + gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c | 1 + gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c | 2 ++ gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c | 1 + 7 files changed, 11 insertions(+) (limited to 'gcc') diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c index 490f4af..046804b 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c @@ -1,5 +1,7 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + #define AVX512VL #ifndef CHECK #define CHECK "avx512f-helper.h" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c index 
09a87de..56245b1 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c @@ -1,4 +1,6 @@ /* { dg-do run { target { ! ia32 } } } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512dq -DTYPE=long" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512dq } */ #include "cond_op_addsubmul_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c index fdcdb34..bdcd2ef 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c @@ -1,5 +1,7 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512bw -DTYPE=short" } */ +/* { dg-require-effective-target avx512bw } */ +/* { dg-require-effective-target avx512vl } */ #define AVX512BW #include "cond_op_addsubmul_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c index 360891f..5ec38df 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ #define AVX512VL #ifndef CHECK diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c index 20ed737..c99c04c 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c @@ -1,4 +1,5 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float" } */ +/* { dg-require-effective-target avx512vl } */ #include "cond_op_addsubmuldiv_double-2.c" diff --git 
a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c index d8180de..4c6514e 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c @@ -1,5 +1,7 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + #define AVX512VL #ifndef CHECK #define CHECK "avx512f-helper.h" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c index 0097735..e13d377 100644 --- a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c +++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c @@ -1,4 +1,5 @@ /* { dg-do run } */ /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float -D__BUILTIN_FMA=__builtin_fmaf" } */ +/* { dg-require-effective-target avx512vl } */ #include "cond_op_fma_double-2.c" -- cgit v1.1 From 5c73b94fdc46f03c761ee5c66e30e00a2bf9ee91 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 4 Aug 2021 09:48:05 +0200 Subject: docs: document threader-mode param gcc/ChangeLog: * doc/invoke.texi: Document threader-mode param. --- gcc/doc/invoke.texi | 3 +++ 1 file changed, 3 insertions(+) (limited to 'gcc') diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 65bb998..4efc8b7 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13421,6 +13421,9 @@ Setting to 0 disables the analysis completely. @item modref-max-escape-points Specifies the maximum number of escape points tracked by modref per SSA-name. +@item threader-mode +Specifies the mode the backwards threader should run in. + @item profile-func-internal-id A parameter to control whether to use function internal id in profile database lookup. 
If the value is 0, the compiler uses an id that -- cgit v1.1 From 4d562591018a51f155a2e5d8b9f3e5860111a327 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 4 Aug 2021 09:22:51 +0200 Subject: tree-optimization/101769 - tail recursion creates possibly infinite loop This makes tail recursion optimization produce a loop structure manually rather than relying on loop fixup. That also allows the loop to be marked as finite (it would eventually blow the stack if it were not). 2021-08-04 Richard Biener PR tree-optimization/101769 * tree-tailcall.c (eliminate_tail_call): Add the created loop for the first recursion and return it via the new output parameter. (optimize_tail_call): Pass through new output param. (tree_optimize_tail_calls_1): After creating all latches, add the created loop to the loop tree. Do not mark loops for fixup. * g++.dg/tree-ssa/pr101769.C: New testcase. --- gcc/testsuite/g++.dg/tree-ssa/pr101769.C | 56 ++++++++++++++++++++++++++++++++ gcc/tree-tailcall.c | 34 +++++++++++-------- 2 files changed, 77 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/g++.dg/tree-ssa/pr101769.C (limited to 'gcc') diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr101769.C b/gcc/testsuite/g++.dg/tree-ssa/pr101769.C new file mode 100644 index 0000000..4979c42 --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/pr101769.C @@ -0,0 +1,56 @@ +// { dg-do compile } +// { dg-require-effective-target c++11 } +// { dg-options "-O2 -fdump-tree-optimized" } + +struct Node +{ + Node* right; + Node* down; +}; + +inline +void free_node(Node*) +{ +} + +void free_all(Node* n_) +{ + if (n_ == nullptr) { + return; + } + free_all(n_->right); + do { + Node* t = n_->down; + free_node(n_); + n_ = t; + } while (n_); +} + +void free_all2_r(Node* n_) +{ + if (n_->right) { + free_all2_r(n_->right); + } + do { + Node* t = n_->down; + free_node(n_); + n_ = t; + } while (n_); +} + +void free_all2(Node* n_) +{ + if (n_) { + free_all2_r(n_); + } +} + +void loop(Node* n_) +{ + do { + n_ = 
n_->down; + } while (n_); +} + +// All functions should be empty. +// { dg-final { scan-tree-dump-times "header = first; + new_loop->finite_p = true; + } + else + gcc_assert (new_loop->header == first); + /* Add phi node entries for arguments. The ordering of the phi nodes should be the same as the ordering of the arguments. */ for (param = DECL_ARGUMENTS (current_function_decl), @@ -1037,11 +1045,12 @@ eliminate_tail_call (struct tailcall *t) mark the tailcalls for the sibcall optimization. */ static bool -optimize_tail_call (struct tailcall *t, bool opt_tailcalls) +optimize_tail_call (struct tailcall *t, bool opt_tailcalls, + class loop *&new_loop) { if (t->tail_recursion) { - eliminate_tail_call (t); + eliminate_tail_call (t, new_loop); return true; } @@ -1177,12 +1186,15 @@ tree_optimize_tail_calls_1 (bool opt_tailcalls) opt_tailcalls = false; } + class loop *new_loop = NULL; for (; tailcalls; tailcalls = next) { next = tailcalls->next; - changed |= optimize_tail_call (tailcalls, opt_tailcalls); + changed |= optimize_tail_call (tailcalls, opt_tailcalls, new_loop); free (tailcalls); } + if (new_loop) + add_loop (new_loop, loops_for_fn (cfun)->tree_root); if (a_acc || m_acc) { @@ -1198,11 +1210,7 @@ tree_optimize_tail_calls_1 (bool opt_tailcalls) } if (changed) - { - /* We may have created new loops. Make them magically appear. */ - loops_state_set (LOOPS_NEED_FIXUP); - free_dominance_info (CDI_DOMINATORS); - } + free_dominance_info (CDI_DOMINATORS); /* Add phi nodes for the virtual operands defined in the function to the header of the loop created by tail recursion elimination. Do so -- cgit v1.1 From 9db0bcd9fdc2e3a659d56435cb18d553f4292edb Mon Sep 17 00:00:00 2001 From: Aldy Hernandez Date: Wed, 4 Aug 2021 10:55:12 +0200 Subject: Mark path_range_query::dump as override. gcc/ChangeLog: * gimple-range-path.h (path_range_query::dump): Mark override. 
--- gcc/gimple-range-path.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/gimple-range-path.h b/gcc/gimple-range-path.h index 43f0ec8..0d2d2e7 100644 --- a/gcc/gimple-range-path.h +++ b/gcc/gimple-range-path.h @@ -41,7 +41,7 @@ public: const bitmap_head *imports); bool range_of_expr (irange &r, tree name, gimple * = NULL) override; bool range_of_stmt (irange &r, gimple *, tree name = NULL) override; - void dump (FILE *); + void dump (FILE *) override; void debug (); private: -- cgit v1.1 From 9f26640f7b89c771b0ebffd7e7f5019d0709a955 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 10:50:28 +0800 Subject: Refine predicate of peephole2 to general_reg_operand. [PR target/101743] The define_peephole2 which is added by r12-2640-gf7bf03cf69ccb7dc should only work on general registers, considering that x86 also supports mov instructions between gpr, sse reg, mask reg, limiting the peephole2 predicate to general_reg_operand. gcc/ChangeLog: PR target/101743 * config/i386/i386.md (peephole2): Refine predicate from register_operand to general_reg_operand. --- gcc/config/i386/i386.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc') diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0c23ddb..51e8b47 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -19423,11 +19423,11 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1). 
;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + [(set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) - (set (match_operand:SWI248 2 "register_operand") + (set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) (set (match_dup 0) (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator" @@ -19455,10 +19455,10 @@ ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2). ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1 (define_peephole2 - [(set (match_operand:SWI248 2 "register_operand") + [(set (match_operand:SWI248 2 "general_reg_operand") (match_operand:SWI248 3)) - (set (match_operand:SWI248 0 "register_operand") - (match_operand:SWI248 1 "register_operand")) + (set (match_operand:SWI248 0 "general_reg_operand") + (match_operand:SWI248 1 "general_reg_operand")) (parallel [(set (reg FLAGS_REG) (match_operand 5)) (set (match_dup 0) (match_operand:SWI248 6))]) (set (match_dup 0) -- cgit v1.1 From 8aa14fa7d98b4d641de9c3ea8d0fa094e0a0ec76 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 4 Aug 2021 11:42:59 +0200 Subject: testsuite: Fix duplicated content of gcc.c-torture/execute/ieee/pr29302-1.x The file has two identical halves, seems like twice applied patch. 2021-08-04 Jakub Jelinek * gcc.c-torture/execute/ieee/pr29302-1.x: Undo doubly applied patch. 
--- gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x | 6 ------ 1 file changed, 6 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x b/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x index 1922b14..3ae2096 100644 --- a/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x +++ b/gcc/testsuite/gcc.c-torture/execute/ieee/pr29302-1.x @@ -4,9 +4,3 @@ if { [istarget "tic6x-*-*"] && [check_effective_target_ti_c67x] } { return 1 } return 0 -if { [istarget "tic6x-*-*"] && [check_effective_target_ti_c67x] } { - # C6X uses -freciprocal-math by default. - set torture_execute_xfail "tic6x-*-*" - return 1 -} -return 0 -- cgit v1.1 From af31cab04770f7a1a1da069415ab62ca2ef54fc4 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 4 Aug 2021 11:53:48 +0200 Subject: c++: Fix up #pragma omp declare {simd,variant} and acc routine parsing When parsing default arguments, we need to temporarily clear parser->omp_declare_simd and parser->oacc_routine, otherwise it can clash with further declarations inside of e.g. lambdas inside of those default arguments. 2021-08-04 Jakub Jelinek PR c++/101759 * parser.c (cp_parser_default_argument): Temporarily override parser->omp_declare_simd and parser->oacc_routine to NULL. * g++.dg/gomp/pr101759.C: New test. * g++.dg/goacc/pr101759.C: New test. --- gcc/cp/parser.c | 2 ++ gcc/testsuite/g++.dg/goacc/pr101759.C | 5 +++++ gcc/testsuite/g++.dg/gomp/pr101759.C | 8 ++++++++ 3 files changed, 15 insertions(+) create mode 100644 gcc/testsuite/g++.dg/goacc/pr101759.C create mode 100644 gcc/testsuite/g++.dg/gomp/pr101759.C (limited to 'gcc') diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 47bf7d9..02ce954 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -24488,6 +24488,8 @@ cp_parser_default_argument (cp_parser *parser, bool template_parm_p) set correctly. 
*/ saved_greater_than_is_operator_p = parser->greater_than_is_operator_p; parser->greater_than_is_operator_p = !template_parm_p; + auto odsd = make_temp_override (parser->omp_declare_simd, NULL); + auto ord = make_temp_override (parser->oacc_routine, NULL); /* Local variable names (and the `this' keyword) may not appear in a default argument. */ saved_local_variables_forbidden_p = parser->local_variables_forbidden_p; diff --git a/gcc/testsuite/g++.dg/goacc/pr101759.C b/gcc/testsuite/g++.dg/goacc/pr101759.C new file mode 100644 index 0000000..522a5d4 --- /dev/null +++ b/gcc/testsuite/g++.dg/goacc/pr101759.C @@ -0,0 +1,5 @@ +// PR c++/101759 +// { dg-do compile { target c++11 } } + +#pragma acc routine +int foo (int x = []() { extern int bar (int); return 1; }()); diff --git a/gcc/testsuite/g++.dg/gomp/pr101759.C b/gcc/testsuite/g++.dg/gomp/pr101759.C new file mode 100644 index 0000000..905b875 --- /dev/null +++ b/gcc/testsuite/g++.dg/gomp/pr101759.C @@ -0,0 +1,8 @@ +// PR c++/101759 +// { dg-do compile { target c++11 } } + +#pragma omp declare simd +int foo (int x = []() { extern int bar (int); return 1; }()); +int corge (int = 1); +#pragma omp declare variant (corge) match (user={condition(true)}) +int baz (int x = []() { extern int qux (int); return 1; }()); -- cgit v1.1 From 87a0b607e40f8122c7fc45d496ef48799fe11550 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 4 Aug 2021 11:42:41 +0200 Subject: tree-optimization/101756 - avoid vectorizing boolean MAX reductions The following avoids vectorizing MIN/MAX reductions on bools which, when ending up as vector(2) would need to be adjusted because of the sign change. The fix instead avoids any reduction vectorization where the result isn't compatible to the original scalar type since we don't compensate for that either. 
2021-08-04 Richard Biener PR tree-optimization/101756 * tree-vect-slp.c (vectorizable_bb_reduc_epilogue): Make sure the result of the reduction epilogue is compatible to the original scalar result. * gcc.dg/vect/bb-slp-pr101756.c: New testcase. --- gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c | 15 +++++++++++++++ gcc/tree-vect-slp.c | 8 +++++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c new file mode 100644 index 0000000..9420e77 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +__attribute__ ((simd)) int +tq (long int ea, int of, int kk) +{ + int bc; + + for (bc = 0; bc < 2; ++bc) + { + ++ea; + of |= !!kk < !!ea; + } + + return of; +} diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index a554c24..d169bed 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -4847,15 +4847,17 @@ static bool vectorizable_bb_reduc_epilogue (slp_instance instance, stmt_vector_for_cost *cost_vec) { - enum tree_code reduc_code - = gimple_assign_rhs_code (instance->root_stmts[0]->stmt); + gassign *stmt = as_a (instance->root_stmts[0]->stmt); + enum tree_code reduc_code = gimple_assign_rhs_code (stmt); if (reduc_code == MINUS_EXPR) reduc_code = PLUS_EXPR; internal_fn reduc_fn; tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance)); if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn) || reduc_fn == IFN_LAST - || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)) + || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH) + || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)), + TREE_TYPE (vectype))) return false; /* There's no way to cost a horizontal vector reduction via REDUC_FN so -- cgit v1.1 From 2724d1bba6b36451404811fba3244f8897717ef3 Mon Sep 17 00:00:00 
2001 From: Richard Biener Date: Fri, 30 Jul 2021 11:06:50 +0200 Subject: Rewrite more vector loads to scalar loads This teaches forwprop to rewrite more vector loads that are only used in BIT_FIELD_REFs as scalar loads. This provides the remaining uplift to SPEC CPU 2017 510.parest_r on Zen 2 which has CPU gathers disabled. In particular vector load + vec_unpack + bit-field-ref is turned into (extending) scalar loads which avoids costly XMM/GPR transitions. To not conflict with vector load + bit-field-ref + vector constructor matching to vector load + shuffle the extended transform is only done after vector lowering. 2021-07-30 Richard Biener * tree-ssa-forwprop.c (pass_forwprop::execute): Split out code to decompose vector loads ... (optimize_vector_load): ... here. Generalize it to handle intermediate widening and TARGET_MEM_REF loads and apply it to loads with a supported vector mode as well. --- gcc/tree-ssa-forwprop.c | 244 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 182 insertions(+), 62 deletions(-) (limited to 'gcc') diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index db3b18b..bd64b8e 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2757,6 +2757,182 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) } +/* Rewrite the vector load at *GSI to component-wise loads if the load + is only used in BIT_FIELD_REF extractions with eventual intermediate + widening. */ + +static void +optimize_vector_load (gimple_stmt_iterator *gsi) +{ + gimple *stmt = gsi_stmt (*gsi); + tree lhs = gimple_assign_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + + /* Gather BIT_FIELD_REFs to rewrite, looking through + VEC_UNPACK_{LO,HI}_EXPR. 
*/ + use_operand_p use_p; + imm_use_iterator iter; + bool rewrite = true; + auto_vec bf_stmts; + auto_vec worklist; + worklist.quick_push (lhs); + do + { + tree def = worklist.pop (); + unsigned HOST_WIDE_INT def_eltsize + = TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def)))); + FOR_EACH_IMM_USE_FAST (use_p, iter, def) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (!is_gimple_assign (use_stmt)) + { + rewrite = false; + break; + } + enum tree_code use_code = gimple_assign_rhs_code (use_stmt); + tree use_rhs = gimple_assign_rhs1 (use_stmt); + if (use_code == BIT_FIELD_REF + && TREE_OPERAND (use_rhs, 0) == def + /* If its on the VEC_UNPACK_{HI,LO}_EXPR + def need to verify it is element aligned. */ + && (def == lhs + || (known_eq (bit_field_size (use_rhs), def_eltsize) + && constant_multiple_p (bit_field_offset (use_rhs), + def_eltsize)))) + { + bf_stmts.safe_push (use_stmt); + continue; + } + /* Walk through one level of VEC_UNPACK_{LO,HI}_EXPR. */ + if (def == lhs + && (use_code == VEC_UNPACK_HI_EXPR + || use_code == VEC_UNPACK_LO_EXPR) + && use_rhs == lhs) + { + worklist.safe_push (gimple_assign_lhs (use_stmt)); + continue; + } + rewrite = false; + break; + } + if (!rewrite) + break; + } + while (!worklist.is_empty ()); + + if (!rewrite) + { + gsi_next (gsi); + return; + } + /* We now have all ultimate uses of the load to rewrite in bf_stmts. */ + + /* Prepare the original ref to be wrapped in adjusted BIT_FIELD_REFs. + For TARGET_MEM_REFs we have to separate the LEA from the reference. 
*/ + tree load_rhs = rhs; + if (TREE_CODE (load_rhs) == TARGET_MEM_REF) + { + if (TREE_CODE (TREE_OPERAND (load_rhs, 0)) == ADDR_EXPR) + mark_addressable (TREE_OPERAND (TREE_OPERAND (load_rhs, 0), 0)); + tree tem = make_ssa_name (TREE_TYPE (TREE_OPERAND (load_rhs, 0))); + gimple *new_stmt + = gimple_build_assign (tem, build1 (ADDR_EXPR, TREE_TYPE (tem), + unshare_expr (load_rhs))); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + load_rhs = build2_loc (EXPR_LOCATION (load_rhs), + MEM_REF, TREE_TYPE (load_rhs), tem, + build_int_cst + (TREE_TYPE (TREE_OPERAND (load_rhs, 1)), 0)); + } + + /* Rewrite the BIT_FIELD_REFs to be actual loads, re-emitting them at + the place of the original load. */ + for (gimple *use_stmt : bf_stmts) + { + tree bfr = gimple_assign_rhs1 (use_stmt); + tree new_rhs = unshare_expr (load_rhs); + if (TREE_OPERAND (bfr, 0) != lhs) + { + /* When the BIT_FIELD_REF is on the promoted vector we have to + adjust it and emit a conversion afterwards. */ + gimple *def_stmt + = SSA_NAME_DEF_STMT (TREE_OPERAND (bfr, 0)); + enum tree_code def_code + = gimple_assign_rhs_code (def_stmt); + + /* The adjusted BIT_FIELD_REF is of the promotion source + vector size and at half of the offset... */ + new_rhs = fold_build3 (BIT_FIELD_REF, + TREE_TYPE (TREE_TYPE (lhs)), + new_rhs, + TYPE_SIZE (TREE_TYPE (TREE_TYPE (lhs))), + size_binop (EXACT_DIV_EXPR, + TREE_OPERAND (bfr, 2), + bitsize_int (2))); + /* ... and offsetted by half of the vector if VEC_UNPACK_HI_EXPR. */ + if (def_code == (!BYTES_BIG_ENDIAN + ? 
VEC_UNPACK_HI_EXPR : VEC_UNPACK_LO_EXPR)) + TREE_OPERAND (new_rhs, 2) + = size_binop (PLUS_EXPR, TREE_OPERAND (new_rhs, 2), + size_binop (EXACT_DIV_EXPR, + TYPE_SIZE (TREE_TYPE (lhs)), + bitsize_int (2))); + tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (lhs))); + gimple *new_stmt = gimple_build_assign (tem, new_rhs); + location_t loc = gimple_location (use_stmt); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + /* Perform scalar promotion. */ + new_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), + NOP_EXPR, tem); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + } + else + { + /* When the BIT_FIELD_REF is on the original load result + we can just wrap that. */ + tree new_rhs = fold_build3 (BIT_FIELD_REF, TREE_TYPE (bfr), + unshare_expr (load_rhs), + TREE_OPERAND (bfr, 1), + TREE_OPERAND (bfr, 2)); + gimple *new_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), + new_rhs); + location_t loc = gimple_location (use_stmt); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + } + gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); + unlink_stmt_vdef (use_stmt); + gsi_remove (&gsi2, true); + } + + /* Finally get rid of the intermediate stmts. */ + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs) + { + if (is_gimple_debug (use_stmt)) + { + if (gimple_debug_bind_p (use_stmt)) + { + gimple_debug_bind_reset_value (use_stmt); + update_stmt (use_stmt); + } + continue; + } + gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); + unlink_stmt_vdef (use_stmt); + release_defs (use_stmt); + gsi_remove (&gsi2, true); + } + /* And the original load. */ + release_defs (stmt); + gsi_remove (gsi, true); +} + + /* Primitive "lattice" function for gimple_simplify. 
*/ static tree @@ -3007,71 +3183,15 @@ pass_forwprop::execute (function *fun) gsi_next (&gsi); } else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE - && TYPE_MODE (TREE_TYPE (lhs)) == BLKmode + && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode + /* After vector lowering rewrite all loads, but + initially do not since this conflicts with + vector CONSTRUCTOR to shuffle optimization. */ + || (fun->curr_properties & PROP_gimple_lvec)) && gimple_assign_load_p (stmt) && !gimple_has_volatile_ops (stmt) - && (TREE_CODE (gimple_assign_rhs1 (stmt)) - != TARGET_MEM_REF) && !stmt_can_throw_internal (cfun, stmt)) - { - /* Rewrite loads used only in BIT_FIELD_REF extractions to - component-wise loads. */ - use_operand_p use_p; - imm_use_iterator iter; - bool rewrite = true; - FOR_EACH_IMM_USE_FAST (use_p, iter, lhs) - { - gimple *use_stmt = USE_STMT (use_p); - if (is_gimple_debug (use_stmt)) - continue; - if (!is_gimple_assign (use_stmt) - || gimple_assign_rhs_code (use_stmt) != BIT_FIELD_REF - || TREE_OPERAND (gimple_assign_rhs1 (use_stmt), 0) != lhs) - { - rewrite = false; - break; - } - } - if (rewrite) - { - gimple *use_stmt; - FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs) - { - if (is_gimple_debug (use_stmt)) - { - if (gimple_debug_bind_p (use_stmt)) - { - gimple_debug_bind_reset_value (use_stmt); - update_stmt (use_stmt); - } - continue; - } - - tree bfr = gimple_assign_rhs1 (use_stmt); - tree new_rhs = fold_build3 (BIT_FIELD_REF, - TREE_TYPE (bfr), - unshare_expr (rhs), - TREE_OPERAND (bfr, 1), - TREE_OPERAND (bfr, 2)); - gimple *new_stmt - = gimple_build_assign (gimple_assign_lhs (use_stmt), - new_rhs); - - location_t loc = gimple_location (use_stmt); - gimple_set_location (new_stmt, loc); - gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); - unlink_stmt_vdef (use_stmt); - gsi_remove (&gsi2, true); - - gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT); - } - - release_defs (stmt); - gsi_remove (&gsi, true); - } - else - gsi_next (&gsi); - } + optimize_vector_load (&gsi); 
else if (code == COMPLEX_EXPR) { -- cgit v1.1 From 96146e61cd7aee62c21c2845916ec42152918ab7 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Wed, 4 Aug 2021 14:19:14 +0100 Subject: Fold (X< Marc Glisse gcc/ChangeLog * match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) as X*(C1+C2), and related variants, using tree_nonzero_bits to ensure that operands are bit-wise disjoint. gcc/testsuite/ChangeLog * gcc.dg/fold-ior-4.c: New test. --- gcc/match.pd | 56 +++++++++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/fold-ior-4.c | 61 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/fold-ior-4.c (limited to 'gcc') diff --git a/gcc/match.pd b/gcc/match.pd index 19cbad7..0fcfd0e 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -2833,6 +2833,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (convert (mult (convert:t @0) { cst; }))))) #endif +/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when + tree_nonzero_bits allows IOR and XOR to be treated like PLUS. 
+ Likewise, handle (X< 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) + (with { wide_int wone = wi::one (TYPE_PRECISION (type)); + wide_int c = wi::add (wi::to_wide (@2), + wi::lshift (wone, wi::to_wide (@4))); } + (mult @1 { wide_int_to_tree (type, c); })))) + (simplify + (op:c (mult:s@0 @1 INTEGER_CST@2) + @1) + (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type) + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0) + (mult @1 + { wide_int_to_tree (type, + wi::add (wi::to_wide (@2), 1)); }))) + (simplify + (op (lshift:s@0 @1 INTEGER_CST@2) + (lshift:s@3 @1 INTEGER_CST@4)) + (if (INTEGRAL_TYPE_P (type) + && tree_int_cst_sgn (@2) > 0 + && tree_int_cst_sgn (@4) > 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) + t = unsigned_type_for (t); + wide_int wone = wi::one (TYPE_PRECISION (t)); + wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), + wi::lshift (wone, wi::to_wide (@4))); } + (convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); }))))) + (simplify + (op:c (lshift:s@0 @1 INTEGER_CST@2) + @1) + (if (INTEGRAL_TYPE_P (type) + && tree_int_cst_sgn (@2) > 0 + && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) + t = unsigned_type_for (t); + wide_int wone = wi::one (TYPE_PRECISION (t)); + wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); } + (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })))))) + /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). 
*/ (for minmax (min max FMIN_ALL FMAX_ALL) diff --git a/gcc/testsuite/gcc.dg/fold-ior-4.c b/gcc/testsuite/gcc.dg/fold-ior-4.c new file mode 100644 index 0000000..8f7213e --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-ior-4.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +unsigned int test_ior(unsigned char i) +{ + return i | (i<<8) | (i<<16) | (i<<24); +} + +unsigned int test_xor(unsigned char i) +{ + return i ^ (i<<8) ^ (i<<16) ^ (i<<24); +} + +unsigned int test_ior_1s(unsigned char i) +{ + return i | (i<<8); +} + +unsigned int test_ior_1u(unsigned char i) +{ + unsigned int t = i; + return t | (t<<8); +} + +unsigned int test_xor_1s(unsigned char i) +{ + return i ^ (i<<8); +} + +unsigned int test_xor_1u(unsigned char i) +{ + unsigned int t = i; + return t ^ (t<<8); +} + +unsigned int test_ior_2s(unsigned char i) +{ + return (i<<8) | (i<<16); +} + +unsigned int test_ior_2u(unsigned char i) +{ + unsigned int t = i; + return (t<<8) | (t<<16); +} + +unsigned int test_xor_2s(unsigned char i) +{ + return (i<<8) ^ (i<<16); +} + +unsigned int test_xor_2u(unsigned char i) +{ + unsigned int t = i; + return (t<<8) ^ (t<<16); +} + +/* { dg-final { scan-tree-dump-not " \\^ " "optimized" } } */ +/* { dg-final { scan-tree-dump-not " \\| " "optimized" } } */ +/* { dg-final { scan-tree-dump-times " \\* 16843009" 2 "optimized" } } */ + -- cgit v1.1 From f2e5d2717d9e249edc5e0d45e49e4f9ef81fc694 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: by_pieces: Pass MAX_PIECES to op_by_pieces_d Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and compare. PR target/101742 * expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces argument to set m_max_size. (move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d. (store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d. (compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d. 
--- gcc/expr.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'gcc') diff --git a/gcc/expr.c b/gcc/expr.c index b65cfcf..096c031 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1110,8 +1110,8 @@ class op_by_pieces_d } public: - op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool, + op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn, + void *, unsigned HOST_WIDE_INT, unsigned int, bool, bool = false); void run (); }; @@ -1120,10 +1120,12 @@ class op_by_pieces_d objects named TO and FROM, which are identified as loads or stores by TO_LOAD and FROM_LOAD. If FROM is a load, the optional FROM_CFN and its associated FROM_CFN_DATA can be used to replace loads with - constant values. LEN describes the length of the operation. */ + constant values. MAX_PIECES describes the maximum number of bytes + at a time which can be moved efficiently. LEN describes the length + of the operation. 
*/ -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, - rtx from, bool from_load, +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, + bool to_load, rtx from, bool from_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, @@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_len (len), m_max_size (max_pieces + 1), m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); @@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, - PUSHG_P (to)) + : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL, + NULL, len, align, PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align, bool qi_vector_mode) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false, qi_vector_mode) + : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn, + cfn_data, len, align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, - align, false) + : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn, + op1_cfn_data, len, align, false) { m_fail_label = fail_label; } -- cgit v1.1 From 31855ba6b16cd138d7484076a08cd40d609654b8 Mon Sep 17 00:00:00 2001 From: 
Richard Biener Date: Thu, 29 Jul 2021 14:14:48 +0200 Subject: Add emulated gather capability to the vectorizer This adds a gather vectorization capability to the vectorizer without target support by decomposing the offset vector, doing scalar loads and then building a vector from the result. This is aimed mainly at cases where vectorizing the rest of the loop offsets the cost of vectorizing the gather. Note it's difficult to avoid vectorizing the offset load, but in some cases later passes can turn the vector load + extract into scalar loads, see the followup patch. On SPEC CPU 2017 510.parest_r this improves runtime from 250s to 219s on a Zen2 CPU which has its native gather instructions disabled (using those the runtime instead increases to 254s) using -Ofast -march=znver2 [-flto]. It turns out the critical loops in this benchmark all perform gather operations. 2021-07-30 Richard Biener * tree-vect-data-refs.c (vect_check_gather_scatter): Include widening conversions only when the result is still handled by native gather or the current offset size not already matches the data size. Also succeed analysis in case there's no native support, noted by an IFN_LAST ifn and a NULL decl. (vect_analyze_data_refs): Always consider gathers. * tree-vect-patterns.c (vect_recog_gather_scatter_pattern): Test for no IFN gather rather than decl gather. * tree-vect-stmts.c (vect_model_load_cost): Pass in the gather-scatter info and cost emulated gathers accordingly. (vect_truncate_gather_scatter_offset): Properly test for no IFN gather. (vect_use_strided_gather_scatters_p): Likewise. (get_load_store_type): Handle emulated gathers and their restrictions. (vectorizable_load): Likewise. Emulate them by extracting scalar offsets, doing scalar loads and a vector construct. * gcc.target/i386/vect-gather-1.c: New testcase. * gfortran.dg/vect/vect-8.f90: Adjust. 
--- gcc/testsuite/gcc.target/i386/vect-gather-1.c | 20 ++++++ gcc/testsuite/gfortran.dg/vect/vect-8.f90 | 2 +- gcc/tree-vect-data-refs.c | 31 +++++--- gcc/tree-vect-patterns.c | 2 +- gcc/tree-vect-stmts.c | 100 ++++++++++++++++++++++++-- 5 files changed, 137 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/vect-gather-1.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.target/i386/vect-gather-1.c b/gcc/testsuite/gcc.target/i386/vect-gather-1.c new file mode 100644 index 0000000..261b66b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-gather-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -msse2 -fdump-tree-vect-details -fdump-tree-forwprop4" } */ + +#ifndef INDEXTYPE +#define INDEXTYPE int +#endif +double vmul(INDEXTYPE *rowstart, INDEXTYPE *rowend, + double *luval, double *dst) +{ + double res = 0; + for (const INDEXTYPE * col = rowstart; col != rowend; ++col, ++luval) + res += *luval * dst[*col]; + return res; +} + +/* With gather emulation this should be profitable to vectorize + even with plain SSE2. */ +/* { dg-final { scan-tree-dump "loop vectorized" "vect" } } */ +/* The index vector loads and promotions should be scalar after forwprop. */ +/* { dg-final { scan-tree-dump-not "vec_unpack" "forwprop4" } } */ diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 index 9994805..cc1aebf 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 @@ -706,5 +706,5 @@ END SUBROUTINE kernel ! { dg-final { scan-tree-dump-times "vectorized 24 loops" 1 "vect" { target aarch64_sve } } } ! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target { aarch64*-*-* && { ! aarch64_sve } } } } } -! { dg-final { scan-tree-dump-times "vectorized 2\[23\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } +! 
{ dg-final { scan-tree-dump-times "vectorized 2\[234\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 6995efb..17d24b4 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -4007,8 +4007,24 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, continue; } - if (TYPE_PRECISION (TREE_TYPE (op0)) - < TYPE_PRECISION (TREE_TYPE (off))) + /* Include the conversion if it is widening and we're using + the IFN path or the target can handle the converted from + offset or the current size is not already the same as the + data vector element size. */ + if ((TYPE_PRECISION (TREE_TYPE (op0)) + < TYPE_PRECISION (TREE_TYPE (off))) + && (use_ifn_p + || (DR_IS_READ (dr) + ? (targetm.vectorize.builtin_gather + && targetm.vectorize.builtin_gather (vectype, + TREE_TYPE (op0), + scale)) + : (targetm.vectorize.builtin_scatter + && targetm.vectorize.builtin_scatter (vectype, + TREE_TYPE (op0), + scale))) + || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)), + TYPE_SIZE (TREE_TYPE (vectype)), 0))) { off = op0; offtype = TREE_TYPE (off); @@ -4036,7 +4052,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, vectype, memory_type, offtype, scale, &ifn, &offset_vectype)) - return false; + ifn = IFN_LAST; + decl = NULL_TREE; } else { @@ -4050,10 +4067,6 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (targetm.vectorize.builtin_scatter) decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); } - - if (!decl) - return false; - ifn = IFN_LAST; /* The offset vector type will be read from DECL when needed. 
*/ offset_vectype = NULL_TREE; @@ -4283,9 +4296,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) { bool maybe_gather = DR_IS_READ (dr) - && !TREE_THIS_VOLATILE (DR_REF (dr)) - && (targetm.vectorize.builtin_gather != NULL - || supports_vec_gather_load_p ()); + && !TREE_THIS_VOLATILE (DR_REF (dr)); bool maybe_scatter = DR_IS_WRITE (dr) && !TREE_THIS_VOLATILE (DR_REF (dr)) diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 743fd3f..25de97b 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -4811,7 +4811,7 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo, function for the gather/scatter operation. */ gather_scatter_info gs_info; if (!vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info) - || gs_info.decl) + || gs_info.ifn == IFN_LAST) return NULL; /* Convert the mask to the right form. */ diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 074dfdc..e6b81a0 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1084,6 +1084,7 @@ static void vect_model_load_cost (vec_info *vinfo, stmt_vec_info stmt_info, unsigned ncopies, poly_uint64 vf, vect_memory_access_type memory_access_type, + gather_scatter_info *gs_info, slp_tree slp_node, stmt_vector_for_cost *cost_vec) { @@ -1172,9 +1173,17 @@ vect_model_load_cost (vec_info *vinfo, if (memory_access_type == VMAT_ELEMENTWISE || memory_access_type == VMAT_GATHER_SCATTER) { - /* N scalar loads plus gathering them into a vector. */ tree vectype = STMT_VINFO_VECTYPE (stmt_info); unsigned int assumed_nunits = vect_nunits_for_cost (vectype); + if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info->ifn == IFN_LAST && !gs_info->decl) + /* For emulated gathers N offset vector element extracts + (we assume the scalar scaling and ptr + offset add is consumed by + the load). 
*/ + inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits, + vec_to_scalar, stmt_info, 0, + vect_body); + /* N scalar loads plus gathering them into a vector. */ inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits, scalar_load, stmt_info, 0, vect_body); @@ -1184,7 +1193,9 @@ vect_model_load_cost (vec_info *vinfo, &inside_cost, &prologue_cost, cost_vec, cost_vec, true); if (memory_access_type == VMAT_ELEMENTWISE - || memory_access_type == VMAT_STRIDED_SLP) + || memory_access_type == VMAT_STRIDED_SLP + || (memory_access_type == VMAT_GATHER_SCATTER + && gs_info->ifn == IFN_LAST && !gs_info->decl)) inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct, stmt_info, 0, vect_body); @@ -1866,7 +1877,8 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree memory_type = TREE_TYPE (DR_REF (dr)); if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, vectype, memory_type, offset_type, scale, - &gs_info->ifn, &gs_info->offset_vectype)) + &gs_info->ifn, &gs_info->offset_vectype) + || gs_info->ifn == IFN_LAST) continue; gs_info->decl = NULL_TREE; @@ -1901,7 +1913,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info, gather_scatter_info *gs_info) { if (!vect_check_gather_scatter (stmt_info, loop_vinfo, gs_info) - || gs_info->decl) + || gs_info->ifn == IFN_LAST) return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo, masked_p, gs_info); @@ -2355,6 +2367,27 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, vls_type == VLS_LOAD ? 
"gather" : "scatter"); return false; } + else if (gs_info->ifn == IFN_LAST && !gs_info->decl) + { + if (vls_type != VLS_LOAD) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported emulated scatter.\n"); + return false; + } + else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant () + || !known_eq (TYPE_VECTOR_SUBPARTS (vectype), + TYPE_VECTOR_SUBPARTS + (gs_info->offset_vectype))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported vector types for emulated " + "gather.\n"); + return false; + } + } /* Gather-scatter accesses perform only component accesses, alignment is irrelevant for them. */ *alignment_support_scheme = dr_unaligned_supported; @@ -8692,6 +8725,15 @@ vectorizable_load (vec_info *vinfo, "unsupported access type for masked load.\n"); return false; } + else if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info.ifn == IFN_LAST + && !gs_info.decl) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported masked emulated gather.\n"); + return false; + } } if (!vec_stmt) /* transformation not required. */ @@ -8725,7 +8767,7 @@ vectorizable_load (vec_info *vinfo, STMT_VINFO_TYPE (orig_stmt_info) = load_vec_info_type; vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type, - slp_node, cost_vec); + &gs_info, slp_node, cost_vec); return true; } @@ -9438,7 +9480,8 @@ vectorizable_load (vec_info *vinfo, unsigned int misalign; unsigned HOST_WIDE_INT align; - if (memory_access_type == VMAT_GATHER_SCATTER) + if (memory_access_type == VMAT_GATHER_SCATTER + && gs_info.ifn != IFN_LAST) { tree zero = build_zero_cst (vectype); tree scale = size_int (gs_info.scale); @@ -9456,6 +9499,51 @@ vectorizable_load (vec_info *vinfo, data_ref = NULL_TREE; break; } + else if (memory_access_type == VMAT_GATHER_SCATTER) + { + /* Emulated gather-scatter. 
*/ + gcc_assert (!final_mask); + unsigned HOST_WIDE_INT const_nunits + = nunits.to_constant (); + vec *ctor_elts; + vec_alloc (ctor_elts, const_nunits); + gimple_seq stmts = NULL; + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset)); + tree scale = size_int (gs_info.scale); + align + = get_object_alignment (DR_REF (first_dr_info->dr)); + tree ltype = build_aligned_type (TREE_TYPE (vectype), + align); + for (unsigned k = 0; k < const_nunits; ++k) + { + tree boff = size_binop (MULT_EXPR, + TYPE_SIZE (idx_type), + bitsize_int (k)); + tree idx = gimple_build (&stmts, BIT_FIELD_REF, + idx_type, vec_offset, + TYPE_SIZE (idx_type), + boff); + idx = gimple_convert (&stmts, sizetype, idx); + idx = gimple_build (&stmts, MULT_EXPR, + sizetype, idx, scale); + tree ptr = gimple_build (&stmts, PLUS_EXPR, + TREE_TYPE (dataref_ptr), + dataref_ptr, idx); + ptr = gimple_convert (&stmts, ptr_type_node, ptr); + tree elt = make_ssa_name (TREE_TYPE (vectype)); + tree ref = build2 (MEM_REF, ltype, ptr, + build_int_cst (ref_type, 0)); + new_stmt = gimple_build_assign (elt, ref); + gimple_seq_add_stmt (&stmts, new_stmt); + CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt); + } + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + new_stmt = gimple_build_assign (NULL_TREE, + build_constructor + (vectype, ctor_elts)); + data_ref = NULL_TREE; + break; + } align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info)); -- cgit v1.1 From 9fcb8ec60302f5f110f94a885b618993c28d18d3 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 4 Aug 2021 14:36:15 +0100 Subject: [testsuite] Fix trapping access in test PR101750 I believe PR101750 to be a testism. Fix it by giving the class a name. gcc/testsuite/ChangeLog: PR tree-optimization/101750 * g++.dg/vect/pr99149.cc: Name class. 
--- gcc/testsuite/g++.dg/vect/pr99149.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/testsuite/g++.dg/vect/pr99149.cc b/gcc/testsuite/g++.dg/vect/pr99149.cc index 00ebe9d..9d58426 100755 --- a/gcc/testsuite/g++.dg/vect/pr99149.cc +++ b/gcc/testsuite/g++.dg/vect/pr99149.cc @@ -11,7 +11,7 @@ public: a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); } }; int f, g; -class { +class mp { a *h; a *i; -- cgit v1.1 From 96c82a16b2076891a9974d0f0e96a0b85fbc2df4 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Sat, 24 Jul 2021 12:53:39 +0200 Subject: Fix debug info for ignored decls at start of assembly Ignored functions decls that are compiled at the start of the assembly have bogus line numbers until the first .file directive, as reported in PR101575. The corresponding binutils bug report is https://sourceware.org/bugzilla/show_bug.cgi?id=28149 The work around for this issue is to emit a dummy .file directive before the first function is compiled, unless another .file directive was already emitted previously. 2021-08-04 Bernd Edlinger PR ada/101575 * dwarf2out.c (dwarf2out_assembly_start): Emit a dummy .file statement when needed. --- gcc/dwarf2out.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'gcc') diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index 884f1e1..b91a9b5 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -29389,7 +29389,18 @@ dwarf2out_assembly_start (void) output_quoted_string (asm_out_file, remap_debug_filename (filename0)); fputc ('\n', asm_out_file); } + else #endif + /* Work around for PR101575: output a dummy .file directive. 
*/ + if (!last_emitted_file && dwarf_debuginfo_p () + && debug_info_level >= DINFO_LEVEL_TERSE) + { + const char *filename0 = get_AT_string (comp_unit_die (), DW_AT_name); + + if (filename0 == NULL) + filename0 = ""; + maybe_emit_file (lookup_filename (filename0)); + } } /* A helper function for dwarf2out_finish called through -- cgit v1.1 From 929f2cf4105ccf12d0684c6d5838f58f0ee5e7c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vincent=20Lef=C3=A8vre?= Date: Wed, 4 Aug 2021 17:25:52 +0200 Subject: gcov: check return code of a fclose gcc/ChangeLog: PR gcov-profile/101773 * gcov-io.c (gcov_close): Check return code of a fclose. --- gcc/gcov-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/gcov-io.c b/gcc/gcov-io.c index 4b1e11d..7819593 100644 --- a/gcc/gcov-io.c +++ b/gcc/gcov-io.c @@ -199,7 +199,9 @@ gcov_close (void) { if (gcov_var.file) { - fclose (gcov_var.file); + if (fclose (gcov_var.file)) + gcov_var.error = 1; + gcov_var.file = 0; } gcov_var.mode = 0; -- cgit v1.1 From eb55b5b0df26e95c98ab59d34e69189d4f61bc0c Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 4 Aug 2021 16:52:07 +0100 Subject: aarch64: Fix a typo gcc/ * config/aarch64/aarch64.c: Fix a typo. --- gcc/config/aarch64/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index f80de2c..81c002b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -15032,7 +15032,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, scalar operation. - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the - the Advanced SIMD implementation. + Advanced SIMD implementation. - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the SVE implementation. 
-- cgit v1.1 From 315a1c3756cbc751c4af0ce0da2157a88d7c3b09 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 4 Aug 2021 16:52:08 +0100 Subject: vect: Tweak dump messages for vector mode choice After vect_analyze_loop has successfully analysed a loop for one base vector mode B1, it considers using following base vector modes to vectorise an epilogue. However, for VECT_COMPARE_COSTS, a later mode B2 might turn out to be better than B1 was. Initially this comparison will be between an epilogue loop (for B2) and a main loop (for B1). However, in r11-6458 I'd added code to reanalyse the B2 epilogue loop as a main loop, partly for correctness and partly for better costing. This can lead to a situation in which we think that the B2 epilogue loop was better than the B1 main loop, but that the B2 main loop is not better than the B1 main loop. There was no dump message to say that this had happened, which made it look like B2 had still won. gcc/ * tree-vect-loop.c (vect_analyze_loop): Print a dump message when a reanalyzed loop fails to be cheaper than the current main loop. 
--- gcc/tree-vect-loop.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 00a57b2..48a54b0 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -3064,7 +3064,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) = opt_loop_vec_info::success (main_loop_vinfo); } else - delete main_loop_vinfo; + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "***** No longer preferring vector" + " mode %s after reanalyzing the loop" + " as a main loop\n", + GET_MODE_NAME + (main_loop_vinfo->vector_mode)); + delete main_loop_vinfo; + } } } -- cgit v1.1 From 5a1017dc305c49c59129d45536630d02dbc01c45 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 4 Aug 2021 16:52:09 +0100 Subject: vect: Tweak comparisons with existing epilogue loops This patch uses a more accurate scalar iteration estimate when comparing the epilogue of a constant-iteration loop with a candidate replacement epilogue. In the testcase, the patch prevents a 1-to-3-element SVE epilogue from seeming better than a 64-bit Advanced SIMD epilogue. gcc/ * tree-vect-loop.c (vect_better_loop_vinfo_p): Detect cases in which old_loop_vinfo is an epilogue loop that handles a constant number of iterations. gcc/testsuite/ * gcc.target/aarch64/sve/cost_model_12.c: New test. 
--- gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c | 19 +++++++++++++++++++ gcc/tree-vect-loop.c | 10 +++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c new file mode 100644 index 0000000..4c5226e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c @@ -0,0 +1,19 @@ +/* { dg-options "-O3 -mtune=neoverse-512tvb" } */ + +void +f (float x[restrict 10][1024], + float y[restrict 10][1024], float z) +{ + for (int i = 0; i < 10; ++i) + { +#pragma GCC unroll 10 + for (int j = 0; j < 10; ++j) + x[j][i] = y[j][i] * z; + } +} + +/* We should unroll the outer loop, with 2x 16-byte vectors and 1x + 8-byte vectors. */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler {\tv[0-9]+\.4s,} } } */ +/* { dg-final { scan-assembler {\tv[0-9]+\.2s,} } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 48a54b0..1e21fe6 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2772,7 +2772,15 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, /* Limit the VFs to what is likely to be the maximum number of iterations, to handle cases in which at least one loop_vinfo is fully-masked. 
*/ - HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_max_niter; + loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo); + unsigned HOST_WIDE_INT main_vf; + if (main_loop + && LOOP_VINFO_NITERS_KNOWN_P (main_loop) + && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf)) + estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf; + else + estimated_max_niter = likely_max_stmt_executions_int (loop); if (estimated_max_niter != -1) { if (known_le (estimated_max_niter, new_vf)) -- cgit v1.1 From 1d65c9d25199264bc8909018df1b0dca71c0b32d Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 19 Jul 2021 14:01:52 +0100 Subject: aarch64: Don't include vec_select element in SIMD multiply cost The Neon multiply/multiply-accumulate/multiply-subtract instructions can take various forms - multiplying full vector registers of values or multiplying one vector by a single element of another. Regardless of the form used, these instructions have the same cost, and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon multiply cost function to match the vec_select used by the lane-referencing forms of the instructions already mentioned. This traversal prevents the cost of the vec_select from being added into the cost of the multiply - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt): Define. (aarch64_rtx_mult_cost): Traverse RTL tree to prevent vec_select cost from being added into Neon multiply cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vmul_element_cost.c: New test. 
--- gcc/config/aarch64/aarch64.c | 34 +++++--- .../gcc.target/aarch64/vmul_element_cost.c | 94 ++++++++++++++++++++++ 2 files changed, 119 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 81c002b..23829bb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -12046,6 +12046,26 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } + +/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as + any subsequent extend and VEC_SELECT from X. Returns the inner scalar + operand if successful, or the original expression on failure. */ +static rtx +aarch64_strip_duplicate_vec_elt (rtx x) +{ + if (GET_CODE (x) == VEC_DUPLICATE + && is_a (GET_MODE (XEXP (x, 0)))) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT) + x = XEXP (x, 0); + else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + && GET_CODE (XEXP (x, 0)) == VEC_SELECT) + x = XEXP (XEXP (x, 0), 0); + } + return x; +} + /* Return true iff CODE is a shift supported in combination with arithmetic instructions. */ @@ -12114,15 +12134,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) if (vec_flags & VEC_ADVSIMD) { /* The by-element versions of the instruction have the same costs as - the normal 3-vector version. So don't add the costs of the - duplicate into the costs of the multiply. We make an assumption - that the input to the VEC_DUPLICATE is already on the FP & SIMD - side. This means costing of a MUL by element pre RA is a bit - optimistic. */ - if (GET_CODE (op0) == VEC_DUPLICATE) - op0 = XEXP (op0, 0); - else if (GET_CODE (op1) == VEC_DUPLICATE) - op1 = XEXP (op1, 0); + the normal 3-vector version. We make an assumption that the input + to the VEC_DUPLICATE is already on the FP & SIMD side. This means + costing of a MUL by element pre RA is a bit optimistic. 
*/ + op0 = aarch64_strip_duplicate_vec_elt (op0); + op1 = aarch64_strip_duplicate_vec_elt (op1); } cost += rtx_cost (op0, mode, MULT, 0, speed); cost += rtx_cost (op1, mode, MULT, 1, speed); diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c new file mode 100644 index 0000000..c153775 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include + +#define TEST_MUL_UNIFORM(name, q, vectype, ts) \ + vectype test_ ## name ## q ## _ ## ts (vectype a, vectype b, vectype c) \ + { \ + vectype t0 = name ## q ## _n_ ## ts (a, c[1]); \ + vectype t1 = name ## q ## _n_ ## ts (b, c[1]); \ + return vmul ## q ## _ ## ts (t0, t1); \ + } + +TEST_MUL_UNIFORM (vmul, , int16x4_t, s16) +TEST_MUL_UNIFORM (vmul, , uint16x4_t, u16) +TEST_MUL_UNIFORM (vmul, , int32x2_t, s32) +TEST_MUL_UNIFORM (vmul, , uint32x2_t, u32) +TEST_MUL_UNIFORM (vmul, , float32x2_t, f32) +TEST_MUL_UNIFORM (vmul, q, int16x8_t, s16) +TEST_MUL_UNIFORM (vmul, q, uint16x8_t, u16) +TEST_MUL_UNIFORM (vmul, q, int32x4_t, s32) +TEST_MUL_UNIFORM (vmul, q, uint32x4_t, u32) +TEST_MUL_UNIFORM (vmul, q, float32x4_t, f32) +TEST_MUL_UNIFORM (vmul, q, float64x2_t, f64) + +#define TEST_MLX_UNIFORM(name, q, vectype, ts) \ + vectype test_ ## name ## q ## _ ## ts (vectype acc, vectype a, vectype b) \ + { \ + acc = name ## q ## _n_ ## ts (acc, a, b[1]); \ + return name ## q ## _n_ ## ts (acc, a, b[1]); \ + } + +TEST_MLX_UNIFORM (vmla, , int16x4_t, s16) +TEST_MLX_UNIFORM (vmla, , uint16x4_t, u16) +TEST_MLX_UNIFORM (vmla, , int32x2_t, s32) +TEST_MLX_UNIFORM (vmla, , uint32x2_t, u32) +TEST_MLX_UNIFORM (vmla, , float32x2_t, f32) +TEST_MLX_UNIFORM (vmla, q, int16x8_t, s16) +TEST_MLX_UNIFORM (vmla, q, uint16x8_t, u16) +TEST_MLX_UNIFORM (vmla, q, int32x4_t, s32) +TEST_MLX_UNIFORM (vmla, q, uint32x4_t, u32) +TEST_MLX_UNIFORM (vmla, q, float32x4_t, f32) + +TEST_MLX_UNIFORM (vmls, , 
int16x4_t, s16) +TEST_MLX_UNIFORM (vmls, , uint16x4_t, u16) +TEST_MLX_UNIFORM (vmls, , int32x2_t, s32) +TEST_MLX_UNIFORM (vmls, , uint32x2_t, u32) +TEST_MLX_UNIFORM (vmls, , float32x2_t, f32) +TEST_MLX_UNIFORM (vmls, q, int16x8_t, s16) +TEST_MLX_UNIFORM (vmls, q, uint16x8_t, u16) +TEST_MLX_UNIFORM (vmls, q, int32x4_t, s32) +TEST_MLX_UNIFORM (vmls, q, uint32x4_t, u32) +TEST_MLX_UNIFORM (vmls, q, float32x4_t, f32) + +#define TEST_MUL_LONG(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## ts (a, c[1]); \ + rettype t1 = name ## ts (b, c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MUL_LONG (vmull_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MUL_LONG (vmull_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MUL_LONG (vmull_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MUL_LONG (vmull_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MUL_LONG (vqdmull_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MUL_LONG (vqdmull_n_, int64x2_t, int32x2_t, s32, s64) + +#define TEST_MLX_LONG(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## ts (acc, a, b[1]); \ + return name ## ts (acc, a, b[1]); \ + } + +TEST_MLX_LONG (vmlal_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vmlal_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MLX_LONG (vmlal_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MLX_LONG (vmlal_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MLX_LONG (vmlsl_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vmlsl_n_, uint32x4_t, uint16x4_t, u16, u32) +TEST_MLX_LONG (vmlsl_n_, int64x2_t, int32x2_t, s32, s64) +TEST_MLX_LONG (vmlsl_n_, uint64x2_t, uint32x2_t, u32, u64) + +TEST_MLX_LONG (vqdmlal_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vqdmlal_n_, int64x2_t, int32x2_t, s32, s64) + +TEST_MLX_LONG (vqdmlsl_n_, int32x4_t, int16x4_t, s16, s32) +TEST_MLX_LONG (vqdmlsl_n_, int64x2_t, int32x2_t, s32, s64) + +/* { dg-final { scan-assembler-not 
"dup\\t" } } */ -- cgit v1.1 From 63834c84d43fc2eeeaa054c5e24d1e468e9eddab Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Mon, 19 Jul 2021 10:19:30 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD multiply cost The Neon multiply/multiply-accumulate/multiply-subtract instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon multiply cost function to match vec_select high-half of its operands. This traversal prevents the cost of the vec_select from being added into the cost of the multiply - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-19 Jonathan Wright * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half): Define. (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon multiply cost. * rtlanal.c (vec_series_highpart_p): Define. * rtlanal.h (vec_series_highpart_p): Declare. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vmul_high_cost.c: New test. --- gcc/config/aarch64/aarch64.c | 22 ++++++ gcc/rtlanal.c | 19 +++++ gcc/rtlanal.h | 4 ++ gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c | 85 +++++++++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 23829bb..e02cbcb 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -78,6 +78,7 @@ #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" #include "fractional-cost.h" +#include "rtlanal.h" /* This file should be included last. 
*/ #include "target-def.h" @@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } +/* Helper function for rtx cost calculation. Strip extension as well as any + inner VEC_SELECT high-half from X. Returns the inner vector operand if + successful, or the original expression on failure. */ +static rtx +aarch64_strip_extend_vec_half (rtx x) +{ + if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT + && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)), + XEXP (x, 1))) + x = XEXP (x, 0); + } + return x; +} /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as any subsequent extend and VEC_SELECT from X. Returns the inner scalar @@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_ADVSIMD) { + /* The select-operand-high-half versions of the instruction have the + same cost as the three vector version - don't add the costs of the + extension or selection into the costs of the multiply. */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); /* The by-element versions of the instruction have the same costs as the normal 3-vector version. We make an assumption that the input to the VEC_DUPLICATE is already on the FP & SIMD side. This means diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index f7f3acb..d37f778 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -6957,6 +6957,25 @@ register_asm_p (const_rtx x) (vec_select:RESULT_MODE OP SEL) + is equivalent to the highpart RESULT_MODE of OP. */ + +bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel) +{ + int nunits; + if (GET_MODE_NUNITS (op_mode).is_constant (&nunits) + && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS)) + { + int offset = BYTES_BIG_ENDIAN ? 
0 : nunits - XVECLEN (sel, 0); + return rtvec_series_p (XVEC (sel, 0), offset); + } + return false; +} + +/* Return true if, for all OP of mode OP_MODE: + + (vec_select:RESULT_MODE OP SEL) + is equivalent to the lowpart RESULT_MODE of OP. */ bool diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h index e164242..542dc78 100644 --- a/gcc/rtlanal.h +++ b/gcc/rtlanal.h @@ -332,6 +332,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base () using vec_rtx_properties = growing_rtx_properties; bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, + rtx sel); + +bool vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel); #endif diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c new file mode 100644 index 0000000..ecc02e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include + +#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16) +TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16) +TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MULL_N(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \ + rettype t1 = name ## _ 
## ts (vget_high_ ## ts (a), c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MLXL_VEC(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \ + intype c) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (b)); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + } + +TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16) + +TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16) + +#define TEST_MLXL_N(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + } + +TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32) + +TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlsl_n, 
int64x2_t, int32x4_t, s32) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ -- cgit v1.1 From 5391688acc997e26375e42340cea885fa6ad0d7d Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:09 +0200 Subject: IBM Z: Get rid of vec merge unspec This patch gets rid of the unspecs we were using for the vector merge instruction and replaces it with generic rtx. gcc/ChangeLog: * config/s390/s390-modes.def: Add more vector modes to support concatenation of two vectors. * config/s390/s390-protos.h (s390_expand_merge_perm_const): Add prototype. (s390_expand_merge): Likewise. * config/s390/s390.c (s390_expand_merge_perm_const): New function. (s390_expand_merge): New function. * config/s390/s390.md (UNSPEC_VEC_MERGEH, UNSPEC_VEC_MERGEL): Remove constant definitions. * config/s390/vector.md (V_HW_2): Add mode iterators. (VI_HW_4, V_HW_4): Rename VI_HW_4 to V_HW_4. (vec_2x_nelts, vec_2x_wide): New mode attributes. (*vmrhb, *vmrlb, *vmrhh, *vmrlh, *vmrhf, *vmrlf, *vmrhg, *vmrlg): New pattern definitions. (vec_widen_umult_lo_, vec_widen_umult_hi_) (vec_widen_smult_lo_, vec_widen_smult_hi_) (vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf, vec_unpacks_lo_v2df) (vec_unpacks_hi_v2df): Adjust expanders to emit non-unspec RTX for vec merge. * config/s390/vx-builtins.md (V_HW_4): Remove mode iterator. Now in vector.md. (vec_mergeh, vec_mergel): Use s390_expand_merge to emit vec merge pattern. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c: Instead of vpdi with 0 and 5 vmrlg and vmrhg are used now. * gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: Likewise. * gcc.target/s390/zvector/vec-types.h: New test. * gcc.target/s390/zvector/vec_merge.c: New test. 
--- gcc/config/s390/s390-modes.def | 11 +- gcc/config/s390/s390-protos.h | 2 + gcc/config/s390/s390.c | 36 ++++ gcc/config/s390/s390.md | 2 - gcc/config/s390/vector.md | 208 +++++++++++++++++---- gcc/config/s390/vx-builtins.md | 35 ++-- .../vector/long-double-asm-in-out-hard-fp-reg.c | 8 +- .../vector/long-double-asm-inout-hard-fp-reg.c | 6 +- gcc/testsuite/gcc.target/s390/zvector/vec-types.h | 37 ++++ gcc/testsuite/gcc.target/s390/zvector/vec_merge.c | 88 +++++++++ 10 files changed, 369 insertions(+), 64 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec-types.h create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec_merge.c (limited to 'gcc') diff --git a/gcc/config/s390/s390-modes.def b/gcc/config/s390/s390-modes.def index 6d814fc..245c2b8 100644 --- a/gcc/config/s390/s390-modes.def +++ b/gcc/config/s390/s390-modes.def @@ -259,14 +259,17 @@ CC_MODE (CCVFANY); /* Vector modes. */ -VECTOR_MODES (INT, 2); /* V2QI */ -VECTOR_MODES (INT, 4); /* V4QI V2HI */ -VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ -VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 2); /* V2QI */ +VECTOR_MODES (INT, 4); /* V4QI V2HI */ +VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ +VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI V2TI */ VECTOR_MODE (FLOAT, SF, 2); /* V2SF */ VECTOR_MODE (FLOAT, SF, 4); /* V4SF */ +VECTOR_MODE (FLOAT, SF, 8); /* V8SF */ VECTOR_MODE (FLOAT, DF, 2); /* V2DF */ +VECTOR_MODE (FLOAT, DF, 4); /* V4DF */ VECTOR_MODE (INT, QI, 1); /* V1QI */ VECTOR_MODE (INT, HI, 1); /* V1HI */ diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 289e018..4b03c6e 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -122,6 +122,8 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum 
rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); +extern rtx s390_expand_merge_perm_const (machine_mode, bool); +extern void s390_expand_merge (rtx, rtx, rtx, bool); extern rtx s390_build_signbit_mask (machine_mode); extern rtx s390_return_addr_rtx (int, rtx); extern rtx s390_back_chain_rtx (void); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 8c7d366..3f4521e 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -7014,6 +7014,42 @@ s390_expand_vec_init (rtx target, rtx vals) } } +/* Return a parallel of constant integers to be used as permutation + vector for a vector merge operation in MODE. If HIGH_P is true the + left-most elements of the source vectors are merged otherwise the + right-most elements. */ +rtx +s390_expand_merge_perm_const (machine_mode mode, bool high_p) +{ + int nelts = GET_MODE_NUNITS (mode); + rtx perm[16]; + int addend = high_p ? 0 : nelts; + + for (int i = 0; i < nelts; i++) + perm[i] = GEN_INT ((i + addend) / 2 + (i % 2) * nelts); + + return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelts, perm)); +} + +/* Emit RTL to implement a vector merge operation of SRC1 and SRC2 + which creates the result in TARGET. HIGH_P determines whether a + merge hi or lo will be generated. */ +void +s390_expand_merge (rtx target, rtx src1, rtx src2, bool high_p) +{ + machine_mode mode = GET_MODE (target); + opt_machine_mode opt_mode_2x = mode_for_vector (GET_MODE_INNER (mode), + 2 * GET_MODE_NUNITS (mode)); + gcc_assert (opt_mode_2x.exists ()); + machine_mode mode_double_nelts = opt_mode_2x.require (); + rtx constv = s390_expand_merge_perm_const (mode, high_p); + src1 = force_reg (GET_MODE (src1), src1); + src2 = force_reg (GET_MODE (src2), src2); + rtx x = gen_rtx_VEC_CONCAT (mode_double_nelts, src1, src2); + x = gen_rtx_VEC_SELECT (mode, x, constv); + emit_insn (gen_rtx_SET (target, x)); +} + /* Emit a vector constant that contains 1s in each element's sign bit position and 0s in other positions. 
MODE is the desired constant's mode. */ extern rtx diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 8ad21b0..d896fae 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -158,8 +158,6 @@ UNSPEC_VEC_LOAD_BNDRY UNSPEC_VEC_LOAD_LEN UNSPEC_VEC_LOAD_LEN_R - UNSPEC_VEC_MERGEH - UNSPEC_VEC_MERGEL UNSPEC_VEC_PACK UNSPEC_VEC_PACK_SATURATE UNSPEC_VEC_PACK_SATURATE_CC diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index ab605b3..51c6332 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -50,7 +50,10 @@ (define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) (define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) -(define_mode_iterator VI_HW_4 [V4SI V4SF]) + +; Directly supported vector modes with a certain number of elements +(define_mode_iterator V_HW_2 [V2DI V2DF]) +(define_mode_iterator V_HW_4 [V4SI V4SF]) ; All integer vector modes supported in a vector register + TImode (define_mode_iterator VIT [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI V1TI TI]) @@ -163,14 +166,14 @@ (DF "d") (V1DF "d") (V2DF "d") (TF "x") (V1TF "x")]) -; Vector with doubled element size. +; Vector with widened element size but half the number of elements. (define_mode_attr vec_double [(V1QI "V1HI") (V2QI "V1HI") (V4QI "V2HI") (V8QI "V4HI") (V16QI "V8HI") (V1HI "V1SI") (V2HI "V1SI") (V4HI "V2SI") (V8HI "V4SI") (V1SI "V1DI") (V2SI "V1DI") (V4SI "V2DI") (V1DI "V1TI") (V2DI "V1TI") (V1SF "V1DF") (V2SF "V1DF") (V4SF "V2DF")]) -; Vector with half the element size. +; Vector with shrinked element size but twice the number of elements. (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16QI") (V1SI "V2HI") (V2SI "V4HI") (V4SI "V8HI") (V1DI "V2SI") (V2DI "V4SI") @@ -178,6 +181,22 @@ (V1DF "V2SF") (V2DF "V4SF") (V1TF "V1DF")]) +; Vector with twice the number of elements but same element size. 
+(define_mode_attr vec_2x_nelts [(V1QI "V2QI") (V2QI "V4QI") (V4QI "V8QI") (V8QI "V16QI") (V16QI "V32QI") + (V1HI "V2HI") (V2HI "V4HI") (V4HI "V8HI") (V8HI "V16HI") + (V1SI "V2SI") (V2SI "V4SI") (V4SI "V8SI") + (V1DI "V2DI") (V2DI "V4DI") + (V1SF "V2SF") (V2SF "V4SF") (V4SF "V8SF") + (V1DF "V2DF") (V2DF "V4DF")]) + +; Vector with widened element size and the same number of elements. +(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI") + (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI") + (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI") + (V1DI "V1TI") (V2DI "V2TI") + (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF") + (V1DF "V1TF") (V2DF "V2TF")]) + ; Vector with half the element size AND half the number of elements. (define_mode_attr vec_halfhalf [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") @@ -532,8 +551,8 @@ }) (define_insn "*vec_vllezlf" - [(set (match_operand:VI_HW_4 0 "register_operand" "=v") - (vec_concat:VI_HW_4 + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_concat:V_HW_4 (vec_concat: (match_operand: 1 "memory_operand" "R") (const_int 0)) @@ -748,6 +767,109 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) +(define_insn "*vmrhb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_VX" + "vmrhb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlb" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (vec_select:V16QI + (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v") + (match_operand:V16QI 2 "register_operand" "v")) + (parallel [(const_int 8) (const_int 
24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_VX" + "vmrlb\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_VX" + "vmrhh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlh" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (vec_select:V8HI + (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_VX" + "vmrlh\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat: (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_VX" + "vmrhf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlf" + [(set (match_operand:V_HW_4 0 "register_operand" "=v") + (vec_select:V_HW_4 + (vec_concat: (match_operand:V_HW_4 1 "register_operand" "v") + (match_operand:V_HW_4 2 "register_operand" "v")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_VX" + "vmrlf\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrhg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: (match_operand:V_HW_2 
1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 2)])))] + "TARGET_VX" + "vmrhg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + +(define_insn "*vmrlg" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 3)])))] + "TARGET_VX" + "vmrlg\t%0,%1,%2"; + [(set_attr "op_type" "VRR")]) + + (define_insn "*tf_to_fprx2_0" [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) (subreg:DF (match_operand:TF 1 "general_operand" "v") 0))] @@ -1271,12 +1393,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, false); }) (define_expand "vec_widen_umult_hi_" @@ -1288,12 +1412,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_UMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, true); }) (define_expand "vec_widen_smult_lo_" @@ -1305,12 +1431,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEL))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const 
(mode, false); }) (define_expand "vec_widen_smult_hi_" @@ -1322,12 +1450,14 @@ (unspec: [(match_dup 1) (match_dup 2)] UNSPEC_VEC_SMULT_ODD)) (set (match_operand: 0 "register_operand" "") - (unspec: [(match_dup 3) (match_dup 4)] - UNSPEC_VEC_MERGEH))] + (vec_select: + (vec_concat: (match_dup 3) (match_dup 4)) + (match_dup 5)))] "TARGET_VX" { operands[3] = gen_reg_rtx (mode); operands[4] = gen_reg_rtx (mode); + operands[5] = s390_expand_merge_perm_const (mode, true); }) ; vec_widen_ushiftl_hi @@ -2166,29 +2296,35 @@ (define_expand "vec_unpacks_lo_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEL)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, false); +}) (define_expand "vec_unpacks_hi_v4sf" [(set (match_dup 2) - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEH)) - (set (match_operand:V2DF 0 "register_operand" "=v") + (vec_select:V4SF + (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V2DF 0 "register_operand" "") (float_extend:V2DF (vec_select:V2SF (match_dup 2) (parallel [(const_int 0) (const_int 2)]))))] "TARGET_VX" -{ operands[2] = gen_reg_rtx(V4SFmode); }) +{ + operands[2] = gen_reg_rtx(V4SFmode); + operands[3] = s390_expand_merge_perm_const (V4SFmode, true); +}) ; double -> long double @@ -2204,29 +2340,35 @@ (define_expand "vec_unpacks_lo_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - 
UNSPEC_VEC_MERGEL)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, false); +}) (define_expand "vec_unpacks_hi_v2df" [(set (match_dup 2) - (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v") - (match_dup 1)] - UNSPEC_VEC_MERGEH)) - (set (match_operand:V1TF 0 "register_operand" "=v") + (vec_select:V2DF + (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1)) + (match_dup 3))) + (set (match_operand:V1TF 0 "register_operand" "") (float_extend:V1TF (vec_select:V1DF (match_dup 2) (parallel [(const_int 0)]))))] "TARGET_VXE" -{ operands[2] = gen_reg_rtx (V2DFmode); }) +{ + operands[2] = gen_reg_rtx (V2DFmode); + operands[3] = s390_expand_merge_perm_const (V2DFmode, true); +}) ; 2 x v2df -> 1 x v4sf diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 3df501b..5abe43b 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -22,7 +22,7 @@ (define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VI_HW_SD [V4SI V2DI]) -(define_mode_iterator V_HW_4 [V4SI V4SF]) + ; Full size vector modes with more than one element which are directly supported in vector registers by the hardware. (define_mode_iterator VEC_HW [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")]) (define_mode_iterator VECF_HW [(V4SF "TARGET_VXE") V2DF]) @@ -232,28 +232,27 @@ [(set_attr "op_type" "VRS,VRX,VSI")]) -; FIXME: The following two patterns might using vec_merge. 
But what is -; the canonical form: (vec_select (vec_merge op0 op1)) or (vec_merge -; (vec_select op0) (vec_select op1) ; vmrhb, vmrhh, vmrhf, vmrhg -(define_insn "vec_mergeh" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEH))] +(define_expand "vec_mergeh" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrh\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], true); + DONE; +}) ; vmrlb, vmrlh, vmrlf, vmrlg -(define_insn "vec_mergel" - [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v") - (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v") - (match_operand:V_128_NOSINGLE 2 "register_operand" "v")] - UNSPEC_VEC_MERGEL))] +(define_expand "vec_mergel" + [(match_operand:V_128_NOSINGLE 0 "register_operand" "") + (match_operand:V_128_NOSINGLE 1 "register_operand" "") + (match_operand:V_128_NOSINGLE 2 "register_operand" "")] "TARGET_VX" - "vmrl\t%v0,%1,%2" - [(set_attr "op_type" "VRR")]) +{ + s390_expand_merge (operands[0], operands[1], operands[2], false); + DONE; +}) ; Vector pack diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c index 2dcaf08..a89dd46 100644 --- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c @@ -16,13 +16,13 @@ sqxbr (long double x) return out; } -/* Ideally `vpdi %v3,%v1,%v3,5` should be optimized away, but the compiler +/* Ideally `vmrlg %v3,%v1,%v3` should be optimized away, but the compiler * can't do it, because the UNSPEC 
pattern operates on the whole register. * Using the SUBREG pattern solves this problem, but it's fragile. */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v2,%v0,%v2,5\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v1,%v1,%v3,0\n} 2 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v3,%v1,%v3,5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v2,%v0,%v2\n} 1 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v1,%v1,%v3\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v3,%v1,%v3\n} 1 } } */ int main (void) diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c index 6c5f88d..dd894c8 100644 --- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c +++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c @@ -15,12 +15,12 @@ sqxbr (long double x) return inout; } -/* Ideally there should be just one `vpdi %v6,%v4,%v6,5`, but the compiler +/* Ideally there should be just one `vmrlg %v6,%v4,%v6`, but the compiler * can't optimize it away, because the UNSPEC pattern operates on the whole * register. Using the SUBREG pattern solves this problem, but it's fragile. 
*/ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v6,%v4,%v6,5\n} 2 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v4,%v4,%v6,0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v6,%v4,%v6\n} 2 } } */ +/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v4,%v4,%v6\n} 2 } } */ int main (void) diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-types.h b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h new file mode 100644 index 0000000..35bd2a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h @@ -0,0 +1,37 @@ +#ifndef VEC_TYPES_H +#define VEC_TYPES_H 1 + +#include + +typedef __vector signed char v16qi; +typedef __vector unsigned char uv16qi; + +typedef __vector signed short v8hi; +typedef __vector unsigned short uv8hi; + +typedef __vector signed int v4si; +typedef __vector unsigned int uv4si; + +typedef __vector signed long long v2di; +typedef __vector unsigned long long uv2di; + +#if __SIZEOF_INT128__ == 16 +typedef __vector __int128_t v1ti; +#endif + +typedef __vector double v2df; +typedef __vector long double v1tf; + +#if __ARCH__ >= 12 +typedef __vector float v4sf; +#endif + +#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \ + ({ VEC_TYPE dummy; \ + const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \ + typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \ + for (int i = 0; i < elts; i++) \ + ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \ + *(VEC_TYPE*)ar;}) + +#endif diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c new file mode 100644 index 0000000..348d1f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c @@ -0,0 +1,88 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ + +/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } 
} */ +/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ + +#include "vec-types.h" +#include + +#define GEN_MERGE(VEC_TYPE, HILO) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return vec_merge##HILO (a, b); } + +GEN_MERGE(v16qi, l) +GEN_MERGE(v16qi, h) +GEN_MERGE(uv16qi, l) +GEN_MERGE(uv16qi, h) + +GEN_MERGE(v8hi, l) +GEN_MERGE(v8hi, h) +GEN_MERGE(uv8hi, l) +GEN_MERGE(uv8hi, h) + +GEN_MERGE(v4si, l) +GEN_MERGE(v4si, h) +GEN_MERGE(uv4si, l) +GEN_MERGE(uv4si, h) + +GEN_MERGE(v4sf, l) +GEN_MERGE(v4sf, h) + +GEN_MERGE(v2di, l) +GEN_MERGE(v2di, h) +GEN_MERGE(uv2di, l) +GEN_MERGE(uv2di, h) + +GEN_MERGE(v2df, l) +GEN_MERGE(v2df, h) + + +#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != (i + elts) / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != i / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \ + CHECK_MERGE_LO (VEC_TYPE, a, b); \ + CHECK_MERGE_HI (VEC_TYPE, a, b); \ + } + +int +main () +{ + CHECK_MERGE(v16qi); + CHECK_MERGE(uv16qi); + CHECK_MERGE(v8hi); + CHECK_MERGE(uv8hi); + CHECK_MERGE(v4si); + CHECK_MERGE(uv4si); + CHECK_MERGE(v4sf); + CHECK_MERGE(v2di); + CHECK_MERGE(uv2di); + CHECK_MERGE(v2df); +} -- cgit v1.1 From 0aa7091befa9fdb67f7013dbd454d336a31ef71d Mon Sep 17 00:00:00 
2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:09 +0200 Subject: IBM Z: Get rid of vpdi unspec The patch gets rid of the unspec used for the vector permute double immediate instruction and replaces it with generic rtx. gcc/ChangeLog: * config/s390/s390.md (UNSPEC_VEC_PERMI): Remove constant definition. * config/s390/vector.md (*vpdi1, *vpdi4): New pattern definitions. * config/s390/vx-builtins.md (*vec_permi): Emit generic rtx instead of an unspec. gcc/testsuite/ChangeLog: * gcc.target/s390/zvector/vec-permi.c: Removed. * gcc.target/s390/zvector/vec_permi.c: New test. --- gcc/config/s390/s390.md | 1 - gcc/config/s390/vector.md | 26 +++++++++ gcc/config/s390/vx-builtins.md | 26 ++++----- gcc/testsuite/gcc.target/s390/zvector/vec-permi.c | 54 ------------------- gcc/testsuite/gcc.target/s390/zvector/vec_permi.c | 66 +++++++++++++++++++++++ 5 files changed, 102 insertions(+), 71 deletions(-) delete mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec-permi.c create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec_permi.c (limited to 'gcc') diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index d896fae..1b894a9 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -166,7 +166,6 @@ UNSPEC_VEC_PACK_UNSIGNED_SATURATE_CC UNSPEC_VEC_PACK_UNSIGNED_SATURATE_GENCC UNSPEC_VEC_PERM - UNSPEC_VEC_PERMI UNSPEC_VEC_EXTEND UNSPEC_VEC_STORE_LEN UNSPEC_VEC_STORE_LEN_R diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 51c6332..48dc564 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -767,6 +767,32 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) + +; First DW of op1 and second DW of op2 +(define_insn "*vpdi1" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 0) (const_int 3)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,1" + 
[(set_attr "op_type" "VRR")]) + +; Second DW of op1 and first of op2 +(define_insn "*vpdi4" + [(set (match_operand:V_HW_2 0 "register_operand" "=v") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "v") + (match_operand:V_HW_2 2 "register_operand" "v")) + (parallel [(const_int 1) (const_int 2)])))] + "TARGET_VX" + "vpdi\t%v0,%v1,%v2,4" + [(set_attr "op_type" "VRR")]) + + (define_insn "*vmrhb" [(set (match_operand:V16QI 0 "register_operand" "=v") (vec_select:V16QI diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 5abe43b..3799e83 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -403,28 +403,22 @@ "vperm\t%v0,%v1,%v2,%v3" [(set_attr "op_type" "VRR")]) +; Incoming op3 is in vec_permi format and will we turned into a +; permute vector consisting of op3 and op4. (define_expand "vec_permi" - [(set (match_operand:V_HW_64 0 "register_operand" "") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "") - (match_operand:V_HW_64 2 "register_operand" "") - (match_operand:QI 3 "const_mask_operand" "")] - UNSPEC_VEC_PERMI))] + [(set (match_operand:V_HW_2 0 "register_operand" "") + (vec_select:V_HW_2 + (vec_concat: + (match_operand:V_HW_2 1 "register_operand" "") + (match_operand:V_HW_2 2 "register_operand" "")) + (parallel [(match_operand:QI 3 "const_mask_operand" "") (match_dup 4)])))] "TARGET_VX" { HOST_WIDE_INT val = INTVAL (operands[3]); - operands[3] = GEN_INT ((val & 1) | (val & 2) << 1); + operands[3] = GEN_INT ((val & 2) >> 1); + operands[4] = GEN_INT ((val & 1) + 2); }) -(define_insn "*vec_permi" - [(set (match_operand:V_HW_64 0 "register_operand" "=v") - (unspec:V_HW_64 [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:V_HW_64 2 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] - UNSPEC_VEC_PERMI))] - "TARGET_VX && (UINTVAL (operands[3]) & 10) == 0" - "vpdi\t%v0,%v1,%v2,%b3" - [(set_attr "op_type" "VRR")]) - ; Vector 
replicate diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c b/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c deleted file mode 100644 index c0a852b..0000000 --- a/gcc/testsuite/gcc.target/s390/zvector/vec-permi.c +++ /dev/null @@ -1,54 +0,0 @@ -/* { dg-do compile } */ -/* { dg-options "-O3 -march=z13 -mzarch --save-temps" } */ -/* { dg-do run { target { s390_z13_hw } } } */ - -/* - * The vector intrinsic vec_permi(a, b, c) chooses one of the two eight-byte - * vector elements in each of a and b, depending on the value of c. The valid - * values for c differ from the encoding for the M4 field in assembly and in the - * binary instruction. - * - * selection | c | encoding in assembly - * a[0] b[0] | 0 | 0 - * a[0] b[1] | 1 | 1 - * a[1] b[0] | 2 | 4 - * a[1] b[1] | 3 | 5 - * - * (i.e., indices a[i] b[j] are encoded for c as (i<<1) | j, yet for the - * M4 field as (i<<2) | j. - */ -#include -#include - -typedef unsigned long long uv2di __attribute__((vector_size(16))); - -__attribute__ ((noipa)) static uv2di -do_vec_permi(uv2di a, uv2di b, int c) -{ - switch(c) { - case 0: return vec_permi(a, b, 0); - case 1: return vec_permi(a, b, 1); - case 2: return vec_permi(a, b, 2); - case 3: return vec_permi(a, b, 3); - default: assert(0); - } -} - -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,1\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,4\n} 1 } } */ -/* { dg-final { scan-assembler-times {\n\tvpdi\t%v\d+,%v\d+,%v\d+,5\n} 1 } } */ - -int -main (void) -{ - uv2di a = { 0xa0, 0xa1 }; - uv2di b = { 0xb0, 0xb1 }; - - for (int i = 0; i < 2; i++) - for (int j = 0; j < 2; j++) { - uv2di res = do_vec_permi(a, b, (i<<1)|j); - assert(res[0] == a[i]); - assert(res[1] == b[j]); - } -} diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c b/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c new file mode 100644 index 0000000..b66fa90 
--- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec_permi.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z13_hw } } } */ + +/* + * The vector intrinsic vec_permi(a, b, c) chooses one of the two eight-byte + * vector elements in each of a and b, depending on the value of c. The valid + * values for c differ from the encoding for the M4 field in assembly and in the + * binary instruction. + * + * selection | c | encoding in assembly + * a[0] b[0] | 0 | 0 -> vmrhg + * a[0] b[1] | 1 | 1 + * a[1] b[0] | 2 | 4 + * a[1] b[1] | 3 | 5 -> vmrlg + * + * (i.e., indices a[i] b[j] are encoded for c as (i<<1) | j, yet for the + * M4 field as (i<<2) | j. + */ + +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvpdi\t" 6 } } */ + +#include "vec-types.h" +#include + +#define GEN_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE __attribute__((noinline)) \ + permi_##BITS##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return vec_permi (a, b, (BITS)); } + +#define GEN_PERMI(VEC_TYPE) \ + GEN_PERMI_BITS(VEC_TYPE, 0); \ + GEN_PERMI_BITS(VEC_TYPE, 1); \ + GEN_PERMI_BITS(VEC_TYPE, 2); \ + GEN_PERMI_BITS(VEC_TYPE, 3); + +GEN_PERMI(v2di) +GEN_PERMI(uv2di) +GEN_PERMI(v2df) + + +#define CHECK_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE r##BITS = permi_##BITS##_##VEC_TYPE (a, b); \ + if (r##BITS[0] != ((BITS) & 2) >> 1 \ + || r##BITS[1] != ((BITS) & 1) + 2) \ + __builtin_abort(); + +#define CHECK_PERMI(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 0); \ + CHECK_PERMI_BITS (VEC_TYPE, 1); \ + CHECK_PERMI_BITS (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 3); \ + } + +int +main () +{ + CHECK_PERMI (v2di); + CHECK_PERMI (uv2di); + CHECK_PERMI (v2df); +} -- cgit v1.1 From 4e34925ef1aeab73e022d80149be8cec92c48667 
Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:10 +0200 Subject: IBM Z: Remove redundant V_HW_64 mode iterator. gcc/ChangeLog: * config/s390/vector.md (V_HW_64): Remove mode iterator. (*vec_load_pair): Use V_HW_2 instead of V_HW_64. * config/s390/vx-builtins.md (vec_scatter_element_SI): Use V_HW_2 instead of V_HW_64. --- gcc/config/s390/vector.md | 7 +++---- gcc/config/s390/vx-builtins.md | 14 +++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'gcc') diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 48dc564..d224165 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -36,7 +36,6 @@ (define_mode_iterator V_HW2 [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE") (V1TF "TARGET_VXE") (TF "TARGET_VXE")]) -(define_mode_iterator V_HW_64 [V2DI V2DF]) (define_mode_iterator VT_HW_HSDT [V8HI V4SI V4SF V2DI V2DF V1TI V1TF TI TF]) (define_mode_iterator V_HW_HSD [V8HI V4SI (V4SF "TARGET_VXE") V2DI V2DF]) @@ -1972,9 +1971,9 @@ }) (define_insn "*vec_load_pair" - [(set (match_operand:V_HW_64 0 "register_operand" "=v,v") - (vec_concat:V_HW_64 (match_operand: 1 "register_operand" "d,v") - (match_operand: 2 "register_operand" "d,v")))] + [(set (match_operand:V_HW_2 0 "register_operand" "=v,v") + (vec_concat:V_HW_2 (match_operand: 1 "register_operand" "d,v") + (match_operand: 2 "register_operand" "d,v")))] "TARGET_VX" "@ vlvgp\t%v0,%1,%2 diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 3799e83..3e7b854 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -452,17 +452,17 @@ ; A 31 bit target address is generated from 64 bit elements ; vsceg -(define_insn "vec_scatter_element_SI" +(define_insn "vec_scatter_element_SI" [(set (mem: (plus:SI (subreg:SI - (unspec: [(match_operand:V_HW_64 1 "register_operand" "v") - (match_operand:QI 3 "const_mask_operand" "C")] + (unspec: [(match_operand:V_HW_2 1 "register_operand" "v") + 
(match_operand:QI 3 "const_mask_operand" "C")] UNSPEC_VEC_EXTRACT) 4) - (match_operand:SI 2 "address_operand" "ZQ"))) - (unspec: [(match_operand:V_HW_64 0 "register_operand" "v") + (match_operand:SI 2 "address_operand" "ZQ"))) + (unspec: [(match_operand:V_HW_2 0 "register_operand" "v") (match_dup 3)] UNSPEC_VEC_EXTRACT))] - "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (mode)" - "vsce\t%v0,%O2(%v1,%R2),%3" + "TARGET_VX && !TARGET_64BIT && UINTVAL (operands[3]) < GET_MODE_NUNITS (mode)" + "vsce\t%v0,%O2(%v1,%R2),%3" [(set_attr "op_type" "VRV")]) ; Element size and target address size is the same -- cgit v1.1 From 6dc8c4656444153c9e2f98d382de39728a849672 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:10 +0200 Subject: IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST for vector merge This patch implements the TARGET_VECTORIZE_VEC_PERM_CONST in the IBM Z backend. The initial implementation only exploits the vector merge instruction but there is more to come. gcc/ChangeLog: * config/s390/s390.c (MAX_VECT_LEN): Define macro. (struct expand_vec_perm_d): Define struct. (expand_perm_with_merge): New function. (vectorize_vec_perm_const_1): New function. (s390_vectorize_vec_perm_const): New function. (TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/perm-merge.c: New test. * gcc.target/s390/vector/vec-types.h: New test. 
--- gcc/config/s390/s390.c | 105 ++++++++++++++++++++++ gcc/testsuite/gcc.target/s390/vector/perm-merge.c | 104 +++++++++++++++++++++ gcc/testsuite/gcc.target/s390/vector/vec-types.h | 35 ++++++++ 3 files changed, 244 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/perm-merge.c create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-types.h (limited to 'gcc') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 3f4521e..8dc805f 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16926,6 +16926,107 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, return after_md_seq; } +#define MAX_VECT_LEN 16 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool testing_p; +}; + +/* Try to expand the vector permute operation described by D using the + vector merge instructions vml and vmh. Return true if vector merge + could be used. */ +static bool +expand_perm_with_merge (const struct expand_vec_perm_d &d) +{ + bool merge_lo_p = true; + bool merge_hi_p = true; + + if (d.nelt % 2) + return false; + + // For V4SI this checks for: { 0, 4, 1, 5 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != telt / 2 + (telt % 2) * d.nelt) + { + merge_hi_p = false; + break; + } + + if (!merge_hi_p) + { + // For V4SI this checks for: { 2, 6, 3, 7 } + for (int telt = 0; telt < d.nelt; telt++) + if (d.perm[telt] != (telt + d.nelt) / 2 + (telt % 2) * d.nelt) + { + merge_lo_p = false; + break; + } + } + else + merge_lo_p = false; + + if (d.testing_p) + return merge_lo_p || merge_hi_p; + + if (merge_lo_p || merge_hi_p) + s390_expand_merge (d.target, d.op0, d.op1, merge_hi_p); + + return merge_lo_p || merge_hi_p; +} + +/* Try to find the best sequence for the vector permute operation + described by D. Return true if the operation could be + expanded. 
*/ +static bool +vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) +{ + if (expand_perm_with_merge (d)) + return true; + + return false; +} + +/* Return true if we can emit instructions for the constant + permutation vector in SEL. If OUTPUT, IN0, IN1 are non-null the + hook is supposed to emit the required INSNs. */ + +bool +s390_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1, + const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned int i, nelt; + + if (!s390_vector_mode_supported_p (vmode) || GET_MODE_SIZE (vmode) != 16) + return false; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = target == NULL_RTX; + + gcc_assert (target == NULL_RTX || REG_P (target)); + gcc_assert (sel.length () == nelt); + + for (i = 0; i < nelt; i++) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + } + + return vectorize_vec_perm_const_1 (d); +} + /* Initialize GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP @@ -17236,6 +17337,10 @@ s390_md_asm_adjust (vec &outputs, vec &inputs, #undef TARGET_MD_ASM_ADJUST #define TARGET_MD_ASM_ADJUST s390_md_asm_adjust +#undef TARGET_VECTORIZE_VEC_PERM_CONST +#define TARGET_VECTORIZE_VEC_PERM_CONST s390_vectorize_vec_perm_const + + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-s390.h" diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-merge.c b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c new file mode 100644 index 0000000..51b23dd --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/perm-merge.c @@ -0,0 +1,104 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ +/* { dg-do run { target { s390_z14_hw } } } */ + +/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */ +/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ + +#include "vec-types.h" + +#define GEN_MERGE_2(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A] }; } + +#define GEN_MERGE_4(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A] }; } + +#define GEN_MERGE_8(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A] }; } + +#define GEN_MERGE_16(VEC_TYPE, HILO, A) \ + VEC_TYPE __attribute__((noinline)) \ + merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { 
\ + return (VEC_TYPE){ a[0+A], b[0+A], a[1+A], b[1+A], a[2+A], b[2+A], a[3+A], b[3+A], \ + a[4+A], b[4+A], a[5+A], b[5+A], a[6+A], b[6+A], a[7+A], b[7+A]}; } + + +GEN_MERGE_16(v16qi, l, 8) +GEN_MERGE_16(v16qi, h, 0) +GEN_MERGE_16(uv16qi, l, 8) +GEN_MERGE_16(uv16qi, h, 0) + +GEN_MERGE_8(v8hi, l, 4) +GEN_MERGE_8(v8hi, h, 0) +GEN_MERGE_8(uv8hi, l, 4) +GEN_MERGE_8(uv8hi, h, 0) + +GEN_MERGE_4(v4si, l, 2) +GEN_MERGE_4(v4si, h, 0) +GEN_MERGE_4(uv4si, l, 2) +GEN_MERGE_4(uv4si, h, 0) + +GEN_MERGE_4(v4sf, l, 2) +GEN_MERGE_4(v4sf, h, 0) + +GEN_MERGE_2(v2di, l, 1) +GEN_MERGE_2(v2di, h, 0) +GEN_MERGE_2(uv2di, l, 1) +GEN_MERGE_2(uv2di, h, 0) + +GEN_MERGE_2(v2df, l, 1) +GEN_MERGE_2(v2df, h, 0) + + +#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != (i + elts) / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \ + { \ + VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \ + int elts = sizeof(v) / sizeof(v[0]); \ + for (int i = 0; i < elts; i++) \ + if (v[i] != i / 2 + (i % 2) * elts) \ + __builtin_abort(); \ + } + +#define CHECK_MERGE(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \ + CHECK_MERGE_LO (VEC_TYPE, a, b); \ + CHECK_MERGE_HI (VEC_TYPE, a, b); \ + } + +int +main () +{ + CHECK_MERGE(v16qi); + CHECK_MERGE(uv16qi); + CHECK_MERGE(v8hi); + CHECK_MERGE(uv8hi); + CHECK_MERGE(v4si); + CHECK_MERGE(uv4si); + CHECK_MERGE(v4sf); + CHECK_MERGE(v2di); + CHECK_MERGE(uv2di); + CHECK_MERGE(v2df); +} diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-types.h b/gcc/testsuite/gcc.target/s390/vector/vec-types.h new file mode 100644 index 0000000..b7ffbe7 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-types.h @@ -0,0 +1,35 @@ +#ifndef VEC_TYPES_H +#define VEC_TYPES_H 1 + +typedef 
__attribute__((vector_size(16))) signed char v16qi; +typedef __attribute__((vector_size(16))) unsigned char uv16qi; + +typedef __attribute__((vector_size(16))) signed short v8hi; +typedef __attribute__((vector_size(16))) unsigned short uv8hi; + +typedef __attribute__((vector_size(16))) signed int v4si; +typedef __attribute__((vector_size(16))) unsigned int uv4si; + +typedef __attribute__((vector_size(16))) signed long long v2di; +typedef __attribute__((vector_size(16))) unsigned long long uv2di; + +#if __SIZEOF_INT128__ == 16 +typedef __attribute__((vector_size(16))) __int128_t v1ti; +#endif + +typedef __attribute__((vector_size(16))) double v2df; +typedef __attribute__((vector_size(16))) long double v1tf; + +#if __ARCH__ >= 12 +typedef __attribute__((vector_size(16))) float v4sf; +#endif + +#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \ + ({ VEC_TYPE dummy; \ + const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \ + typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \ + for (int i = 0; i < elts; i++) \ + ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \ + *(VEC_TYPE*)ar;}) + +#endif -- cgit v1.1 From 361da782a25031c6ae3967bf8c10a8119845255c Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Wed, 4 Aug 2021 18:40:11 +0200 Subject: IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST for vpdi This patch makes use of the vector permute double immediate instruction for constant permute vectors. gcc/ChangeLog: * config/s390/s390.c (expand_perm_with_vpdi): New function. (vectorize_vec_perm_const_1): Call expand_perm_with_vpdi. * config/s390/vector.md (*vpdi1, @vpdi1): Enable a parameterized expander. (*vpdi4, @vpdi4): Likewise. gcc/testsuite/ChangeLog: * gcc.target/s390/vector/perm-vpdi.c: New test. 
--- gcc/config/s390/s390.c | 47 +++++++++++++++++++++++ gcc/config/s390/vector.md | 5 +-- gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c | 49 ++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c (limited to 'gcc') diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 8dc805f..673a134 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -16979,6 +16979,50 @@ expand_perm_with_merge (const struct expand_vec_perm_d &d) return merge_lo_p || merge_hi_p; } +/* Try to expand the vector permute operation described by D using the + vector permute doubleword immediate instruction vpdi. Return true + if vpdi could be used. + + VPDI allows 4 different immediate values (0, 1, 4, 5). The 0 and 5 + cases are covered by vmrhg and vmrlg already. So we only care + about the 1, 4 cases here. + 1 - First element of src1 and second of src2 + 4 - Second element of src1 and first of src2 */ +static bool +expand_perm_with_vpdi (const struct expand_vec_perm_d &d) +{ + bool vpdi1_p = false; + bool vpdi4_p = false; + rtx op0_reg, op1_reg; + + // Only V2DI and V2DF are supported here. + if (d.nelt != 2) + return false; + + if (d.perm[0] == 0 && d.perm[1] == 3) + vpdi1_p = true; + + if (d.perm[0] == 1 && d.perm[1] == 2) + vpdi4_p = true; + + if (!vpdi1_p && !vpdi4_p) + return false; + + if (d.testing_p) + return true; + + op0_reg = force_reg (GET_MODE (d.op0), d.op0); + op1_reg = force_reg (GET_MODE (d.op1), d.op1); + + if (vpdi1_p) + emit_insn (gen_vpdi1 (d.vmode, d.target, op0_reg, op1_reg)); + + if (vpdi4_p) + emit_insn (gen_vpdi4 (d.vmode, d.target, op0_reg, op1_reg)); + + return true; +} + /* Try to find the best sequence for the vector permute operation described by D. Return true if the operation could be expanded. 
*/ @@ -16988,6 +17032,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d) if (expand_perm_with_merge (d)) return true; + if (expand_perm_with_vpdi (d)) + return true; + return false; } diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index d224165..70274a6 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -768,7 +768,7 @@ ; First DW of op1 and second DW of op2 -(define_insn "*vpdi1" +(define_insn "@vpdi1" [(set (match_operand:V_HW_2 0 "register_operand" "=v") (vec_select:V_HW_2 (vec_concat: @@ -780,7 +780,7 @@ [(set_attr "op_type" "VRR")]) ; Second DW of op1 and first of op2 -(define_insn "*vpdi4" +(define_insn "@vpdi4" [(set (match_operand:V_HW_2 0 "register_operand" "=v") (vec_select:V_HW_2 (vec_concat: @@ -926,7 +926,6 @@ operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8); }) -; vec_perm_const for V2DI using vpdi? ;; ;; Vector integer arithmetic instructions diff --git a/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c new file mode 100644 index 0000000..cc92531 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/perm-vpdi.c @@ -0,0 +1,49 @@ +/* { dg-do run { target { s390*-*-* } } } */ +/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */ + +/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */ +/* { dg-final { scan-assembler-times "\tvpdi\t" 6 } } */ + +#include "vec-types.h" +#include + +#define GEN_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE __attribute__((noinline)) \ + permi_##BITS##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \ + return (VEC_TYPE){a[((BITS) & 2) >> 1], b[(BITS) & 1] }; } + +#define GEN_PERMI(VEC_TYPE) \ + GEN_PERMI_BITS(VEC_TYPE, 0); \ + GEN_PERMI_BITS(VEC_TYPE, 1); \ + GEN_PERMI_BITS(VEC_TYPE, 2); \ + GEN_PERMI_BITS(VEC_TYPE, 3); \ + +GEN_PERMI(v2di) +GEN_PERMI(uv2di) +GEN_PERMI(v2df) + + +#define CHECK_PERMI_BITS(VEC_TYPE, BITS) \ + VEC_TYPE 
r##BITS = permi_##BITS##_##VEC_TYPE (a, b); \ + if (r##BITS[0] != ((BITS) & 2) >> 1 \ + || r##BITS[1] != ((BITS) & 1) + 2) \ + __builtin_abort(); + +#define CHECK_PERMI(VEC_TYPE) \ + { \ + VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \ + VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 0); \ + CHECK_PERMI_BITS (VEC_TYPE, 1); \ + CHECK_PERMI_BITS (VEC_TYPE, 2); \ + CHECK_PERMI_BITS (VEC_TYPE, 3); \ + } + +int +main () +{ + CHECK_PERMI (v2di); + CHECK_PERMI (uv2di); + CHECK_PERMI (v2df); +} -- cgit v1.1 From 09dba016db937e61be21ef1e9581065a9ed2847d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Wed, 4 Aug 2021 06:15:04 -0700 Subject: x86: Avoid stack realignment when copying data with SSE register To avoid stack realignment, call ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy data with with SSE register from one memory location to another. gcc/ PR target/101772 * config/i386/i386-expand.c (ix86_expand_vector_move): Call ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy data with SSE register from one memory location to another. gcc/testsuite/ PR target/101772 * gcc.target/i386/eh_return-2.c: New test. --- gcc/config/i386/i386-expand.c | 6 +++++- gcc/testsuite/gcc.target/i386/eh_return-2.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-2.c (limited to 'gcc') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 1d469bf..bd21efa 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -613,7 +613,11 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) arguments in memory. 
*/ if (!register_operand (op0, mode) && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); + { + rtx scratch = ix86_gen_scratch_sse_rtx (mode); + emit_move_insn (scratch, op1); + op1 = scratch; + } tmp[0] = op0; tmp[1] = op1; ix86_expand_vector_move_misalign (mode, tmp); diff --git a/gcc/testsuite/gcc.target/i386/eh_return-2.c b/gcc/testsuite/gcc.target/i386/eh_return-2.c new file mode 100644 index 0000000..f23f449 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/eh_return-2.c @@ -0,0 +1,16 @@ +/* PR target/101772 */ +/* { dg-do compile } */ +/* { dg-additional-options "-O0 -march=x86-64 -mstackrealign" } */ + +struct _Unwind_Context _Unwind_Resume_or_Rethrow_this_context; + +void offset (int); + +struct _Unwind_Context { + void *reg[7]; +} _Unwind_Resume_or_Rethrow() { + struct _Unwind_Context cur_contextcur_context = + _Unwind_Resume_or_Rethrow_this_context; + offset(0); + __builtin_eh_return ((long) offset, 0); +} -- cgit v1.1 From 5738a64f8b3cf132b88b39af84b9f5f5a9a1554c Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: x86: Update STORE_MAX_PIECES Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move is enabled since vec_duplicate enabled by inter-unit move is used to implement store_by_pieces of 16/32/64 bytes. gcc/ PR target/101742 * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. gcc/testsuite/ PR target/101742 * gcc.target/i386/pr101742a.c: New test. * gcc.target/i386/pr101742b.c: Likewise. 
--- gcc/config/i386/i386.h | 26 +++++++++++++++----------- gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c (limited to 'gcc') diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9..21fe51b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1780,18 +1780,22 @@ typedef struct ix86_args { && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? 16 : UNITS_PER_WORD))) -/* STORE_MAX_PIECES is the number of bytes at a time that we can - store efficiently. */ +/* STORE_MAX_PIECES is the number of bytes at a time that we can store + efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled + since vec_duplicate enabled by inter-unit move is used to implement + store_by_pieces of 16/32/64 bytes. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. 
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c new file mode 100644 index 0000000..67ea405 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ + +int n2; + +__attribute__ ((simd)) char +w7 (void) +{ + short int xb = n2; + int qp; + + for (qp = 0; qp < 2; ++qp) + xb = xb < 1; + + return xb; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c new file mode 100644 index 0000000..ba19064 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ + +#include "pr101742a.c" -- cgit v1.1 From ded2c2c068f6f2825474758cb03a05070a5837e8 Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Wed, 4 Aug 2021 18:21:21 -0400 Subject: analyzer: initial implementation of asm support [PR101570] gcc/ChangeLog: PR analyzer/101570 * Makefile.in (ANALYZER_OBJS): Add analyzer/region-model-asm.o. gcc/analyzer/ChangeLog: PR analyzer/101570 * analyzer.cc (maybe_reconstruct_from_def_stmt): Add GIMPLE_ASM case. * analyzer.h (class asm_output_svalue): New forward decl. (class reachable_regions): New forward decl. * complexity.cc (complexity::from_vec_svalue): New. * complexity.h (complexity::from_vec_svalue): New decl. * engine.cc (feasibility_state::maybe_update_for_edge): Handle asm stmts by calling on_asm_stmt. * region-model-asm.cc: New file. * region-model-manager.cc (region_model_manager::maybe_fold_asm_output_svalue): New. (region_model_manager::get_or_create_asm_output_svalue): New. (region_model_manager::log_stats): Log m_asm_output_values_map. * region-model.cc (region_model::on_stmt_pre): Handle GIMPLE_ASM. * region-model.h (visitor::visit_asm_output_svalue): New. (region_model_manager::get_or_create_asm_output_svalue): New decl. 
(region_model_manager::maybe_fold_asm_output_svalue): New decl. (region_model_manager::asm_output_values_map_t): New typedef. (region_model_manager::m_asm_output_values_map): New field. (region_model::on_asm_stmt): New. * store.cc (binding_cluster::on_asm): New. * store.h (binding_cluster::on_asm): New decl. * svalue.cc (svalue::cmp_ptr): Handle SK_ASM_OUTPUT. (asm_output_svalue::dump_to_pp): New. (asm_output_svalue::dump_input): New. (asm_output_svalue::input_idx_to_asm_idx): New. (asm_output_svalue::accept): New. * svalue.h (enum svalue_kind): Add SK_ASM_OUTPUT. (svalue::dyn_cast_asm_output_svalue): New. (class asm_output_svalue): New. (is_a_helper ::test): New. (struct default_hash_traits): New. gcc/testsuite/ChangeLog: PR analyzer/101570 * gcc.dg/analyzer/asm-x86-1.c: New test. * gcc.dg/analyzer/asm-x86-lp64-1.c: New test. * gcc.dg/analyzer/asm-x86-lp64-2.c: New test. * gcc.dg/analyzer/pr101570.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c: New test. * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c: New test. 
Signed-off-by: David Malcolm --- gcc/Makefile.in | 1 + gcc/analyzer/analyzer.cc | 1 + gcc/analyzer/analyzer.h | 2 + gcc/analyzer/complexity.cc | 16 ++ gcc/analyzer/complexity.h | 1 + gcc/analyzer/engine.cc | 2 + gcc/analyzer/region-model-asm.cc | 303 +++++++++++++++++++ gcc/analyzer/region-model-manager.cc | 48 ++++ gcc/analyzer/region-model.cc | 5 +- gcc/analyzer/region-model.h | 13 + gcc/analyzer/store.cc | 17 ++ gcc/analyzer/store.h | 1 + gcc/analyzer/svalue.cc | 89 ++++++ gcc/analyzer/svalue.h | 145 +++++++++- gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c | 69 +++++ gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c | 131 +++++++++ gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c | 34 +++ gcc/testsuite/gcc.dg/analyzer/pr101570.c | 5 + .../asm-x86-linux-array_index_mask_nospec.c | 74 +++++ .../torture/asm-x86-linux-cpuid-paravirt-1.c | 81 ++++++ .../torture/asm-x86-linux-cpuid-paravirt-2.c | 135 +++++++++ .../gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c | 46 +++ .../torture/asm-x86-linux-rdmsr-paravirt.c | 210 ++++++++++++++ .../gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c | 33 +++ .../asm-x86-linux-wfx_get_ps_timeout-full.c | 319 +++++++++++++++++++++ .../asm-x86-linux-wfx_get_ps_timeout-reduced.c | 77 +++++ 26 files changed, 1855 insertions(+), 3 deletions(-) create mode 100644 gcc/analyzer/region-model-asm.cc create mode 100644 gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/pr101570.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c create mode 100644 
gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c create mode 100644 gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c (limited to 'gcc') diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 98eb479..c0f6e0a 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1262,6 +1262,7 @@ ANALYZER_OBJS = \ analyzer/program-state.o \ analyzer/region.o \ analyzer/region-model.o \ + analyzer/region-model-asm.o \ analyzer/region-model-impl-calls.o \ analyzer/region-model-manager.o \ analyzer/region-model-reachability.o \ diff --git a/gcc/analyzer/analyzer.cc b/gcc/analyzer/analyzer.cc index b845b86..5578877 100644 --- a/gcc/analyzer/analyzer.cc +++ b/gcc/analyzer/analyzer.cc @@ -131,6 +131,7 @@ maybe_reconstruct_from_def_stmt (tree ssa_name, { default: gcc_unreachable (); + case GIMPLE_ASM: case GIMPLE_NOP: case GIMPLE_PHI: /* Can't handle these. */ diff --git a/gcc/analyzer/analyzer.h b/gcc/analyzer/analyzer.h index 8de5d60..896b350 100644 --- a/gcc/analyzer/analyzer.h +++ b/gcc/analyzer/analyzer.h @@ -53,6 +53,7 @@ class svalue; class widening_svalue; class compound_svalue; class conjured_svalue; + class asm_output_svalue; typedef hash_set svalue_set; class region; class frame_region; @@ -77,6 +78,7 @@ class call_details; struct rejected_constraint; class constraint_manager; class equiv_class; +class reachable_regions; class pending_diagnostic; class state_change_event; diff --git a/gcc/analyzer/complexity.cc b/gcc/analyzer/complexity.cc index ece4272..ae9f982 100644 --- a/gcc/analyzer/complexity.cc +++ b/gcc/analyzer/complexity.cc @@ -90,6 +90,22 @@ complexity::from_pair (const complexity &c1, const complexity &c2) MAX (c1.m_max_depth, c2.m_max_depth) + 1); } +/* Get complexity for a new node that references the svalues in VEC. 
*/ + +complexity +complexity::from_vec_svalue (const vec &vec) +{ + unsigned num_nodes = 0; + unsigned max_depth = 0; + for (auto iter_sval : vec) + { + const complexity &iter_c = iter_sval->get_complexity (); + num_nodes += iter_c.m_num_nodes; + max_depth = MAX (max_depth, iter_c.m_max_depth); + } + return complexity (num_nodes + 1, max_depth + 1); +} + } // namespace ana #endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/complexity.h b/gcc/analyzer/complexity.h index 459987e..85c0372 100644 --- a/gcc/analyzer/complexity.h +++ b/gcc/analyzer/complexity.h @@ -36,6 +36,7 @@ struct complexity complexity (const region *reg); complexity (const svalue *sval); static complexity from_pair (const complexity &c1, const complexity &c); + static complexity from_vec_svalue (const vec &vec); /* The total number of svalues and regions in the tree of this entity, including the entity itself. */ diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc index ee625fb..ecd4265 100644 --- a/gcc/analyzer/engine.cc +++ b/gcc/analyzer/engine.cc @@ -3718,6 +3718,8 @@ feasibility_state::maybe_update_for_edge (logger *logger, if (const gassign *assign = dyn_cast (stmt)) m_model.on_assignment (assign, NULL); + else if (const gasm *asm_stmt = dyn_cast (stmt)) + m_model.on_asm_stmt (asm_stmt, NULL); else if (const gcall *call = dyn_cast (stmt)) { bool terminate_path; diff --git a/gcc/analyzer/region-model-asm.cc b/gcc/analyzer/region-model-asm.cc new file mode 100644 index 0000000..3efc3fd --- /dev/null +++ b/gcc/analyzer/region-model-asm.cc @@ -0,0 +1,303 @@ +/* Handling inline asm in the analyzer. + Copyright (C) 2021 Free Software Foundation, Inc. + Contributed by David Malcolm . + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tree.h" +#include "function.h" +#include "basic-block.h" +#include "gimple.h" +#include "gimple-iterator.h" +#include "diagnostic-core.h" +#include "pretty-print.h" +#include "tristate.h" +#include "selftest.h" +#include "json.h" +#include "analyzer/analyzer.h" +#include "analyzer/analyzer-logging.h" +#include "options.h" +#include "analyzer/call-string.h" +#include "analyzer/program-point.h" +#include "analyzer/store.h" +#include "analyzer/region-model.h" +#include "analyzer/region-model-reachability.h" +#include "stmt.h" + +#if ENABLE_ANALYZER + +namespace ana { + +/* Minimal asm support for the analyzer. + + The objective of this code is to: + - minimize false positives from the analyzer on the Linux kernel + (which makes heavy use of inline asm), whilst + - avoiding having to "teach" the compiler anything about specific strings + in asm statements. + + Specifically, we want to: + + (a) mark asm outputs and certain other regions as having been written to, + to avoid false postives from -Wanalyzer-use-of-uninitialized-value. + + (b) identify some of these stmts as "deterministic" so that we can + write consistent outputs given consistent inputs, so that we can + avoid false positives for paths in which an asm is invoked twice + with the same inputs and is expected to emit the same output. + + This file implements heuristics for achieving the above. */ + +/* Determine if ASM_STMT is deterministic, in the sense of (b) above. 
+ + Consider this x86 function taken from the Linux kernel + (arch/x86/include/asm/barrier.h): + + static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) + { + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; + } + + The above is a mitigation for Spectre-variant-1 attacks, for clamping + an array access to within the range of [0, size] if the CPU speculates + past the array bounds. + + However, it is ultimately used to implement wdev_to_wvif: + + static inline struct wfx_vif * + wdev_to_wvif(struct wfx_dev *wdev, int vif_id) + { + vif_id = array_index_nospec(vif_id, ARRAY_SIZE(wdev->vif)); + if (!wdev->vif[vif_id]) { + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; + } + + which is used by: + + if (wdev_to_wvif(wvif->wdev, 1)) + return wdev_to_wvif(wvif->wdev, 1)->vif; + + The code has been written to assume that wdev_to_wvif is deterministic, + and won't change from returning non-NULL at the "if" clause to + returning NULL at the "->vif" dereference. + + By treating the above specific "asm volatile" as deterministic we avoid + a false positive from -Wanalyzer-null-dereference. */ + +static bool +deterministic_p (const gasm *asm_stmt) +{ + /* Assume something volatile with no inputs is querying + changeable state e.g. rdtsc. */ + if (gimple_asm_ninputs (asm_stmt) == 0 + && gimple_asm_volatile_p (asm_stmt)) + return false; + + /* Otherwise assume it's purely a function of its inputs. */ + return true; +} + +/* Update this model for the asm STMT, using CTXT to report any + diagnostics. + + Compare with cfgexpand.c: expand_asm_stmt. */ + +void +region_model::on_asm_stmt (const gasm *stmt, region_model_context *ctxt) +{ + logger *logger = ctxt ? 
ctxt->get_logger () : NULL; + LOG_SCOPE (logger); + + const unsigned noutputs = gimple_asm_noutputs (stmt); + const unsigned ninputs = gimple_asm_ninputs (stmt); + + auto_vec output_tvec; + auto_vec input_tvec; + auto_vec constraints; + + /* Copy the gimple vectors into new vectors that we can manipulate. */ + output_tvec.safe_grow (noutputs, true); + input_tvec.safe_grow (ninputs, true); + constraints.safe_grow (noutputs + ninputs, true); + + for (unsigned i = 0; i < noutputs; ++i) + { + tree t = gimple_asm_output_op (stmt, i); + output_tvec[i] = TREE_VALUE (t); + constraints[i] = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); + } + for (unsigned i = 0; i < ninputs; i++) + { + tree t = gimple_asm_input_op (stmt, i); + input_tvec[i] = TREE_VALUE (t); + constraints[i + noutputs] + = TREE_STRING_POINTER (TREE_VALUE (TREE_PURPOSE (t))); + } + + /* Determine which regions are reachable from the inputs + to this stmt. */ + reachable_regions reachable_regs (this); + + int num_errors = 0; + + auto_vec output_regions (noutputs); + for (unsigned i = 0; i < noutputs; ++i) + { + tree val = output_tvec[i]; + const char *constraint; + bool is_inout; + bool allows_reg; + bool allows_mem; + + const region *dst_reg = get_lvalue (val, ctxt); + output_regions.quick_push (dst_reg); + reachable_regs.add (dst_reg, true); + + /* Try to parse the output constraint. If that fails, there's + no point in going further. 
*/ + constraint = constraints[i]; + if (!parse_output_constraint (&constraint, i, ninputs, noutputs, + &allows_mem, &allows_reg, &is_inout)) + { + if (logger) + logger->log ("error parsing constraint for output %i: %qs", + i, constraint); + num_errors++; + continue; + } + + if (logger) + { + logger->log ("output %i: %qs %qE" + " is_inout: %i allows_reg: %i allows_mem: %i", + i, constraint, val, + (int)is_inout, (int)allows_reg, (int)allows_mem); + logger->start_log_line (); + logger->log_partial (" region: "); + dst_reg->dump_to_pp (logger->get_printer (), true); + logger->end_log_line (); + } + + } + + /* Ideally should combine with inout_svals to determine the + "effective inputs" and use this for the asm_output_svalue. */ + + auto_vec input_svals (ninputs); + for (unsigned i = 0; i < ninputs; i++) + { + tree val = input_tvec[i]; + const char *constraint = constraints[i + noutputs]; + bool allows_reg, allows_mem; + if (! parse_input_constraint (&constraint, i, ninputs, noutputs, 0, + constraints.address (), + &allows_mem, &allows_reg)) + { + if (logger) + logger->log ("error parsing constraint for input %i: %qs", + i, constraint); + num_errors++; + continue; + } + + tree src_expr = input_tvec[i]; + const svalue *src_sval = get_rvalue (src_expr, ctxt); + check_for_poison (src_sval, src_expr, ctxt); + input_svals.quick_push (src_sval); + reachable_regs.handle_sval (src_sval); + + if (logger) + { + logger->log ("input %i: %qs %qE" + " allows_reg: %i allows_mem: %i", + i, constraint, val, + (int)allows_reg, (int)allows_mem); + logger->start_log_line (); + logger->log_partial (" sval: "); + src_sval->dump_to_pp (logger->get_printer (), true); + logger->end_log_line (); + } + } + + if (num_errors > 0) + gcc_unreachable (); + + if (logger) + { + logger->log ("reachability: "); + reachable_regs.dump_to_pp (logger->get_printer ()); + logger->end_log_line (); + } + + /* Given the regions that were reachable from the inputs we + want to clobber them. 
+ This is similar to region_model::handle_unrecognized_call, + but the unknown call policies seems too aggressive (e.g. purging state + from anything that's ever escaped). Instead, clobber any clusters + that were reachable in *this* asm stmt, rather than those that + escaped, and we don't treat the values as having escaped. + We also assume that asm stmts don't affect sm-state. */ + for (auto iter = reachable_regs.begin_mutable_base_regs (); + iter != reachable_regs.end_mutable_base_regs (); ++iter) + { + const region *base_reg = *iter; + if (base_reg->symbolic_for_unknown_ptr_p ()) + continue; + + binding_cluster *cluster = m_store.get_or_create_cluster (base_reg); + cluster->on_asm (stmt, m_mgr->get_store_manager ()); + } + + /* Update the outputs. */ + for (unsigned output_idx = 0; output_idx < noutputs; output_idx++) + { + tree dst_expr = output_tvec[output_idx]; + const region *dst_reg = output_regions[output_idx]; + + const svalue *sval; + if (deterministic_p (stmt) + && input_svals.length () <= asm_output_svalue::MAX_INPUTS) + sval = m_mgr->get_or_create_asm_output_svalue (TREE_TYPE (dst_expr), + stmt, + output_idx, + input_svals); + else + { + sval = m_mgr->get_or_create_conjured_svalue (TREE_TYPE (dst_expr), + stmt, + dst_reg); + purge_state_involving (sval, ctxt); + } + set_value (dst_reg, sval, ctxt); + } +} + +} // namespace ana + +#endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/region-model-manager.cc b/gcc/analyzer/region-model-manager.cc index 14c57d8..9e4644f 100644 --- a/gcc/analyzer/region-model-manager.cc +++ b/gcc/analyzer/region-model-manager.cc @@ -1087,6 +1087,51 @@ region_model_manager::get_or_create_conjured_svalue (tree type, return conjured_sval; } +/* Subroutine of region_model_manager::get_or_create_asm_output_svalue. + Return a folded svalue, or NULL. */ + +const svalue * +region_model_manager:: +maybe_fold_asm_output_svalue (tree type, + const vec &inputs) +{ + /* Unknown inputs should lead to unknown results. 
*/ + for (const auto &iter : inputs) + if (iter->get_kind () == SK_UNKNOWN) + return get_or_create_unknown_svalue (type); + + return NULL; +} + +/* Return the svalue * of type TYPE for OUTPUT_IDX of the deterministic + asm stmt ASM_STMT, given INPUTS as inputs. */ + +const svalue * +region_model_manager:: +get_or_create_asm_output_svalue (tree type, + const gasm *asm_stmt, + unsigned output_idx, + const vec &inputs) +{ + gcc_assert (inputs.length () <= asm_output_svalue::MAX_INPUTS); + + if (const svalue *folded + = maybe_fold_asm_output_svalue (type, inputs)) + return folded; + + const char *asm_string = gimple_asm_string (asm_stmt); + const unsigned noutputs = gimple_asm_noutputs (asm_stmt); + + asm_output_svalue::key_t key (type, asm_string, output_idx, inputs); + if (asm_output_svalue **slot = m_asm_output_values_map.get (key)) + return *slot; + asm_output_svalue *asm_output_sval + = new asm_output_svalue (type, asm_string, output_idx, noutputs, inputs); + RETURN_UNKNOWN_IF_TOO_COMPLEX (asm_output_sval); + m_asm_output_values_map.put (key, asm_output_sval); + return asm_output_sval; +} + /* Given STRING_CST, a STRING_CST and BYTE_OFFSET_CST a constant, attempt to get the character at that offset, returning either the svalue for the character constant, or NULL if unsuccessful. 
*/ @@ -1505,6 +1550,9 @@ region_model_manager::log_stats (logger *logger, bool show_objs) const log_uniq_map (logger, show_objs, "widening_svalue", m_widening_values_map); log_uniq_map (logger, show_objs, "compound_svalue", m_compound_values_map); log_uniq_map (logger, show_objs, "conjured_svalue", m_conjured_values_map); + log_uniq_map (logger, show_objs, "asm_output_svalue", + m_asm_output_values_map); + logger->log ("max accepted svalue num_nodes: %i", m_max_complexity.m_num_nodes); logger->log ("max accepted svalue max_depth: %i", diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc index 1bc411b..58da7e3 100644 --- a/gcc/analyzer/region-model.cc +++ b/gcc/analyzer/region-model.cc @@ -980,7 +980,10 @@ region_model::on_stmt_pre (const gimple *stmt, break; case GIMPLE_ASM: - /* No-op for now. */ + { + const gasm *asm_stmt = as_a (stmt); + on_asm_stmt (asm_stmt, ctxt); + } break; case GIMPLE_CALL: diff --git a/gcc/analyzer/region-model.h b/gcc/analyzer/region-model.h index d07ce9c..30f02a0 100644 --- a/gcc/analyzer/region-model.h +++ b/gcc/analyzer/region-model.h @@ -221,6 +221,7 @@ public: virtual void visit_widening_svalue (const widening_svalue *) {} virtual void visit_compound_svalue (const compound_svalue *) {} virtual void visit_conjured_svalue (const conjured_svalue *) {} + virtual void visit_asm_output_svalue (const asm_output_svalue *) {} virtual void visit_region (const region *) {} }; @@ -274,6 +275,11 @@ public: const binding_map &map); const svalue *get_or_create_conjured_svalue (tree type, const gimple *stmt, const region *id_reg); + const svalue * + get_or_create_asm_output_svalue (tree type, + const gasm *asm_stmt, + unsigned output_idx, + const vec &inputs); const svalue *maybe_get_char_from_string_cst (tree string_cst, tree byte_offset_cst); @@ -346,6 +352,8 @@ private: const svalue *maybe_undo_optimize_bit_field_compare (tree type, const compound_svalue *compound_sval, tree cst, const svalue *arg1); + const svalue 
*maybe_fold_asm_output_svalue (tree type, + const vec &inputs); unsigned m_next_region_id; root_region m_root_region; @@ -410,6 +418,10 @@ private: conjured_svalue *> conjured_values_map_t; conjured_values_map_t m_conjured_values_map; + typedef hash_map asm_output_values_map_t; + asm_output_values_map_t m_asm_output_values_map; + bool m_check_complexity; /* Maximum complexity of svalues that weren't rejected. */ @@ -537,6 +549,7 @@ class region_model void on_assignment (const gassign *stmt, region_model_context *ctxt); const svalue *get_gassign_result (const gassign *assign, region_model_context *ctxt); + void on_asm_stmt (const gasm *asm_stmt, region_model_context *ctxt); bool on_call_pre (const gcall *stmt, region_model_context *ctxt, bool *out_terminate_path); void on_call_post (const gcall *stmt, diff --git a/gcc/analyzer/store.cc b/gcc/analyzer/store.cc index 8ee414d..eac1295 100644 --- a/gcc/analyzer/store.cc +++ b/gcc/analyzer/store.cc @@ -1796,6 +1796,23 @@ binding_cluster::on_unknown_fncall (const gcall *call, } } +/* Mark this cluster as having been clobbered by STMT. */ + +void +binding_cluster::on_asm (const gasm *stmt, + store_manager *mgr) +{ + m_map.empty (); + + /* Bind it to a new "conjured" value using CALL. */ + const svalue *sval + = mgr->get_svalue_manager ()->get_or_create_conjured_svalue + (m_base_region->get_type (), stmt, m_base_region); + bind (mgr, m_base_region, sval); + + m_touched = true; +} + /* Return true if this binding_cluster has no information i.e. if there are no bindings, and it hasn't been marked as having escaped, or touched symbolically. 
*/ diff --git a/gcc/analyzer/store.h b/gcc/analyzer/store.h index bc58694..b75691e 100644 --- a/gcc/analyzer/store.h +++ b/gcc/analyzer/store.h @@ -603,6 +603,7 @@ public: void mark_as_escaped (); void on_unknown_fncall (const gcall *call, store_manager *mgr); + void on_asm (const gasm *stmt, store_manager *mgr); bool escaped_p () const { return m_escaped; } bool touched_p () const { return m_touched; } diff --git a/gcc/analyzer/svalue.cc b/gcc/analyzer/svalue.cc index a1e6f50..6913161 100644 --- a/gcc/analyzer/svalue.cc +++ b/gcc/analyzer/svalue.cc @@ -508,6 +508,29 @@ svalue::cmp_ptr (const svalue *sval1, const svalue *sval2) conjured_sval2->get_id_region ()); } break; + case SK_ASM_OUTPUT: + { + const asm_output_svalue *asm_output_sval1 + = (const asm_output_svalue *)sval1; + const asm_output_svalue *asm_output_sval2 + = (const asm_output_svalue *)sval2; + if (int asm_string_cmp = strcmp (asm_output_sval1->get_asm_string (), + asm_output_sval2->get_asm_string ())) + return asm_string_cmp; + if (int output_idx_cmp = ((int)asm_output_sval1->get_output_idx () + - (int)asm_output_sval2->get_output_idx ())) + return output_idx_cmp; + if (int cmp = ((int)asm_output_sval1->get_num_inputs () + - (int)asm_output_sval2->get_num_inputs ())) + return cmp; + for (unsigned i = 0; i < asm_output_sval1->get_num_inputs (); i++) + if (int input_cmp + = svalue::cmp_ptr (asm_output_sval1->get_input (i), + asm_output_sval2->get_input (i))) + return input_cmp; + return 0; + } + break; } } @@ -1794,6 +1817,72 @@ conjured_svalue::accept (visitor *v) const m_id_reg->accept (v); } +/* class asm_output_svalue : public svalue. */ + +/* Implementation of svalue::dump_to_pp vfunc for asm_output_svalue. 
*/ + +void +asm_output_svalue::dump_to_pp (pretty_printer *pp, bool simple) const +{ + if (simple) + { + pp_printf (pp, "ASM_OUTPUT(%qs, %%%i, {", + get_asm_string (), + get_output_idx ()); + for (unsigned i = 0; i < m_num_inputs; i++) + { + if (i > 0) + pp_string (pp, ", "); + dump_input (pp, 0, m_input_arr[i], simple); + } + pp_string (pp, "})"); + } + else + { + pp_printf (pp, "asm_output_svalue (%qs, %%%i, {", + get_asm_string (), + get_output_idx ()); + for (unsigned i = 0; i < m_num_inputs; i++) + { + if (i > 0) + pp_string (pp, ", "); + dump_input (pp, 0, m_input_arr[i], simple); + } + pp_string (pp, "})"); + } +} + +/* Subroutine of asm_output_svalue::dump_to_pp. */ + +void +asm_output_svalue::dump_input (pretty_printer *pp, + unsigned input_idx, + const svalue *sval, + bool simple) const +{ + pp_printf (pp, "%%%i: ", input_idx_to_asm_idx (input_idx)); + sval->dump_to_pp (pp, simple); +} + +/* Convert INPUT_IDX from an index into the array of inputs + into the index of all operands for the asm stmt. */ + +unsigned +asm_output_svalue::input_idx_to_asm_idx (unsigned input_idx) const +{ + return input_idx + m_num_outputs; +} + +/* Implementation of svalue::accept vfunc for asm_output_svalue. */ + +void +asm_output_svalue::accept (visitor *v) const +{ + v->visit_asm_output_svalue (this); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i]->accept (v); +} + } // namespace ana #endif /* #if ENABLE_ANALYZER */ diff --git a/gcc/analyzer/svalue.h b/gcc/analyzer/svalue.h index debe439..63f7d15 100644 --- a/gcc/analyzer/svalue.h +++ b/gcc/analyzer/svalue.h @@ -47,7 +47,8 @@ enum svalue_kind SK_PLACEHOLDER, SK_WIDENING, SK_COMPOUND, - SK_CONJURED + SK_CONJURED, + SK_ASM_OUTPUT }; /* svalue and its subclasses. @@ -74,7 +75,9 @@ enum svalue_kind widening_svalue (SK_WIDENING): a merger of two svalues (possibly in an iteration). compound_svalue (SK_COMPOUND): a mapping of bit-ranges to svalues - conjured_svalue (SK_CONJURED): a value arising from a stmt. 
*/ + conjured_svalue (SK_CONJURED): a value arising from a stmt + asm_output_svalue (SK_ASM_OUTPUT): an output from a deterministic + asm stmt. */ /* An abstract base class representing a value held by a region of memory. */ @@ -124,6 +127,8 @@ public: dyn_cast_compound_svalue () const { return NULL; } virtual const conjured_svalue * dyn_cast_conjured_svalue () const { return NULL; } + virtual const asm_output_svalue * + dyn_cast_asm_output_svalue () const { return NULL; } tree maybe_get_constant () const; const region *maybe_get_region () const; @@ -1394,4 +1399,140 @@ template <> struct default_hash_traits static const bool empty_zero_p = true; }; +namespace ana { + +/* An output from a deterministic asm stmt, where we want to identify a + particular unknown value, rather than resorting to the unknown_value + singleton. + + Comparisons of variables that share the same asm_output_svalue are known + to be equal, even if we don't know what the value is. */ + +class asm_output_svalue : public svalue +{ +public: + /* Imposing an upper limit and using a (small) array allows key_t + to avoid memory management. */ + static const unsigned MAX_INPUTS = 2; + + /* A support class for uniquifying instances of asm_output_svalue. */ + struct key_t + { + key_t (tree type, + const char *asm_string, + unsigned output_idx, + const vec &inputs) + : m_type (type), m_asm_string (asm_string), m_output_idx (output_idx), + m_num_inputs (inputs.length ()) + { + gcc_assert (inputs.length () <= MAX_INPUTS); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i] = inputs[i]; + } + + hashval_t hash () const + { + inchash::hash hstate; + hstate.add_ptr (m_type); + /* We don't bother hashing m_asm_str. 
*/ + hstate.add_int (m_output_idx); + for (unsigned i = 0; i < m_num_inputs; i++) + hstate.add_ptr (m_input_arr[i]); + return hstate.end (); + } + + bool operator== (const key_t &other) const + { + if (!(m_type == other.m_type + && 0 == (strcmp (m_asm_string, other.m_asm_string)) + && m_output_idx == other.m_output_idx + && m_num_inputs == other.m_num_inputs)) + return false; + for (unsigned i = 0; i < m_num_inputs; i++) + if (m_input_arr[i] != other.m_input_arr[i]) + return false; + return true; + } + + /* Use m_asm_string to mark empty/deleted, as m_type can be NULL for + legitimate instances. */ + void mark_deleted () { m_asm_string = reinterpret_cast (1); } + void mark_empty () { m_asm_string = NULL; } + bool is_deleted () const + { + return m_asm_string == reinterpret_cast (1); + } + bool is_empty () const { return m_asm_string == NULL; } + + tree m_type; + const char *m_asm_string; + unsigned m_output_idx; + unsigned m_num_inputs; + const svalue *m_input_arr[MAX_INPUTS]; + }; + + asm_output_svalue (tree type, + const char *asm_string, + unsigned output_idx, + unsigned num_outputs, + const vec &inputs) + : svalue (complexity::from_vec_svalue (inputs), type), + m_asm_string (asm_string), + m_output_idx (output_idx), + m_num_outputs (num_outputs), + m_num_inputs (inputs.length ()) + { + gcc_assert (inputs.length () <= MAX_INPUTS); + for (unsigned i = 0; i < m_num_inputs; i++) + m_input_arr[i] = inputs[i]; + } + + enum svalue_kind get_kind () const FINAL OVERRIDE { return SK_ASM_OUTPUT; } + const asm_output_svalue * + dyn_cast_asm_output_svalue () const FINAL OVERRIDE + { + return this; + } + + void dump_to_pp (pretty_printer *pp, bool simple) const FINAL OVERRIDE; + void accept (visitor *v) const FINAL OVERRIDE; + + const char *get_asm_string () const { return m_asm_string; } + unsigned get_output_idx () const { return m_output_idx; } + unsigned get_num_inputs () const { return m_num_inputs; } + const svalue *get_input (unsigned idx) const { return 
m_input_arr[idx]; } + + private: + void dump_input (pretty_printer *pp, + unsigned input_idx, + const svalue *sval, + bool simple) const; + unsigned input_idx_to_asm_idx (unsigned input_idx) const; + + const char *m_asm_string; + unsigned m_output_idx; + + /* We capture this so that we can offset the input indices + to match the %0, %1, %2 in the asm_string when dumping. */ + unsigned m_num_outputs; + + unsigned m_num_inputs; + const svalue *m_input_arr[MAX_INPUTS]; +}; + +} // namespace ana + +template <> +template <> +inline bool +is_a_helper ::test (const svalue *sval) +{ + return sval->get_kind () == SK_ASM_OUTPUT; +} + +template <> struct default_hash_traits +: public member_function_hash_traits +{ + static const bool empty_zero_p = true; +}; #endif /* GCC_ANALYZER_SVALUE_H */ diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c new file mode 100644 index 0000000..f6026b7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-1.c @@ -0,0 +1,69 @@ +/* { dg-do assemble { target x86_64-*-* } } */ + +#include "analyzer-decls.h" + +int test_out (void) +{ + int dst_a, dst_b; + asm ("mov 42, %0" + : "=r" (dst_a)); + asm ("mov 42, %0" + : "=r" (dst_b)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +int test_out_in (int src_a) +{ + int dst_a, dst_b; + asm ("mov %1, %0" + : "=r" (dst_a) + : "r" (src_a)); + asm ("mov %1, %0" + : "=r" (dst_b) + : "r" (src_a)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +int test_out_in_in (int src_a, int src_b) +{ + int dst_a, dst_b; + asm ("mov %1, %0;\n" + "add %2, %0" + : "=r" (dst_a) + : "r" (src_a), + "r" (src_b)); + asm ("mov %1, %0;\n" + "add %2, %0" + : "=r" (dst_b) + : "r" (src_a), + "r" (src_b)); + __analyzer_eval (dst_a == dst_b); /* { dg-warning "TRUE" } */ + return dst_a; +} + +void test_inout_1 (int v) +{ + int saved = v; + int result_a, result_b; + asm ("dec %0" + : "+r" (v)); + result_a = v; + 
+ asm ("dec %0" + : "+r" (v)); + result_b = v; + + __analyzer_eval (v == saved); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (v == result_a); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (v == result_b); /* { dg-warning "TRUE" } */ +} + +void test_inout_2 (void) +{ + int v; + int result_a, result_b; + asm ("dec %0" /* { dg-warning "use of uninitialized value 'v'" } */ + : "+r" (v)); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c new file mode 100644 index 0000000..c235e22 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-1.c @@ -0,0 +1,131 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +#include "analyzer-decls.h" + +#include + +int test_1 (int src) +{ + int dst; + asm ("mov %1, %0\n\t" + "add $1, %0" + : "=r" (dst) + : "r" (src)); + return dst; +} + +uint32_t test_2 (uint32_t Mask) +{ + uint32_t Index; + asm ("bsfl %[aMask], %[aIndex]" + : [aIndex] "=r" (Index) + : [aMask] "r" (Mask) + : "cc"); + return Index; +} + +int test_3a (int p1, int p2) +{ + asm goto ("btl %1, %0\n\t" + "jc %l2" + : // No outputs + : "r" (p1), "r" (p2) + : "cc" + : carry); + + return 0; + + carry: + return 1; +} + +int test_3b (int p1, int p2) +{ + asm goto ("btl %1, %0\n\t" + "jc %l[carry]" + : // No outputs + : "r" (p1), "r" (p2) + : "cc" + : carry); + + return 0; + + carry: + return 1; +} + +uint64_t test_4 (void) +{ + uint64_t start_time, end_time; + + // Get start time + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. + "or %%rdx, %0" // 'Or' in the lower bits. + : "=a" (start_time) + : + : "rdx"); + + // could do other work here + + // Get end time + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. + "or %%rdx, %0" // 'Or' in the lower bits. 
+ : "=a" (end_time) + : + : "rdx"); + + __analyzer_eval (start_time == end_time); /* { dg-warning "UNKNOWN" } */ + + // Get elapsed time + return end_time - start_time; +} + +static uint64_t get_time (void) +{ + uint64_t result; + asm volatile ("rdtsc\n\t" // Returns the time in EDX:EAX. + "shl $32, %%rdx\n\t" // Shift the upper bits left. + "or %%rdx, %0" // 'Or' in the lower bits. + : "=a" (result) + : + : "rdx"); + return result; +} + +uint64_t test_4a (void) +{ + uint64_t start_time, end_time; + + start_time = get_time (); + // could do other work here + end_time = get_time (); + + __analyzer_eval (start_time == end_time); /* { dg-warning "UNKNOWN" } */ + + // Get elapsed time + return end_time - start_time; +} + +asm ("\t.pushsection .text\n" + "\t.globl add_asm\n" + "\t.type add_asm, @function\n" + "add_asm:\n" + "\tmovq %rdi, %rax\n" + "\tadd %rsi, %rax\n" + "\tret\n" + "\t.popsection\n"); + +int test_5 (int count) +{ + asm goto ("dec %0; jb %l[stop]" + : "+r" (count) + : + : + : stop); + return count; +stop: + return 0; +} diff --git a/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c new file mode 100644 index 0000000..fa50739 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/asm-x86-lp64-2.c @@ -0,0 +1,34 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +/* Adapted from Linux x86: page_ref_dec_and_test.c (GPL-2.0). 
*/ + +typedef _Bool bool; + +typedef struct { + int counter; +} atomic_t; + +bool +arch_atomic_dec_and_test(atomic_t *v) { + return ({ + bool c; + asm volatile(".pushsection .smp_locks,\"a\"\n" + ".balign 4\n" + ".long 671f - .\n" + ".popsection\n" + "671:" + "\n\tlock; " + "decl" + " " + "%[var]" + "\n\t/* output condition code " + "e" + "*/\n" + : [ var ] "+m"(v->counter), "=@cc" + "e"(c) + : + : "memory"); + c; + }); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/pr101570.c b/gcc/testsuite/gcc.dg/analyzer/pr101570.c new file mode 100644 index 0000000..809bad6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/pr101570.c @@ -0,0 +1,5 @@ +void +test2 (_Complex double f) +{ + __asm__ ("" : "=r" (__real f)); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c new file mode 100644 index 0000000..6201fdb --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c @@ -0,0 +1,74 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +#include "../analyzer-decls.h" + +/* Copied from linux: arch/x86/include/asm/barrier.h (GPL-2.0) */ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; +} + +/* The analyzer ought to treat array_index_mask_nospec as being + effectively pure. 
*/ + +void test_1 (unsigned long index, unsigned long size) +{ + unsigned long a = array_index_mask_nospec (index, size); + unsigned long b = array_index_mask_nospec (index, size); + __analyzer_eval (a == b); /* { dg-warning "TRUE" } */ +} + +void test_2 (unsigned long index_a, unsigned long size_a, + unsigned long index_b, unsigned long size_b) +{ + unsigned long aa_1 = array_index_mask_nospec (index_a, size_a); + unsigned long ab_1 = array_index_mask_nospec (index_a, size_b); + unsigned long ba_1 = array_index_mask_nospec (index_b, size_a); + unsigned long bb_1 = array_index_mask_nospec (index_b, size_b); + + unsigned long aa_2 = array_index_mask_nospec (index_a, size_a); + unsigned long ab_2 = array_index_mask_nospec (index_a, size_b); + unsigned long ba_2 = array_index_mask_nospec (index_b, size_a); + unsigned long bb_2 = array_index_mask_nospec (index_b, size_b); + + __analyzer_eval (aa_1 == aa_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (ab_1 == ab_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (ba_1 == ba_2); /* { dg-warning "TRUE" } */ + __analyzer_eval (bb_1 == bb_2); /* { dg-warning "TRUE" } */ + + __analyzer_eval (aa_1 == ab_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (aa_1 == ba_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (aa_1 == bb_1); /* { dg-warning "UNKNOWN" } */ + + __analyzer_eval (ab_1 == ba_1); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (ab_1 == bb_1); /* { dg-warning "UNKNOWN" } */ + + __analyzer_eval (ba_1 == bb_1); /* { dg-warning "UNKNOWN" } */ +} + +/* Equivalent asm strings should be treated the same, rather + than requiring the results to come from the same stmt. */ + +void test_3 (unsigned long index, unsigned long size) +{ + unsigned long a = array_index_mask_nospec (index, size); + unsigned long b; + + /* Copy of the asm from array_index_mask_nospec. 
*/ + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (b) + :"g"(size),"r" (index) + :"cc"); + + __analyzer_eval (a == b); /* { dg-warning "TRUE" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c new file mode 100644 index 0000000..cf5cf97 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c @@ -0,0 +1,81 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). */ + +register unsigned long current_stack_pointer asm("rsp"); + +struct pv_cpu_ops { + /* snip */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx); + /* snip */ +}; +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +/* snip */ +static void cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) { + unsigned long __edi = __edi, __esi = __esi, __edx = __edx, __ecx = __ecx, + __eax = __eax; + asm volatile( + "771:\n\t" + "999:\n\t" + ".pushsection .discard.retpoline_safe\n\t" + " " + ".quad" + " " + " 999b\n\t" + ".popsection\n\t" + "call *%c[paravirt_opptr];" + "\n" + "772:\n" + ".pushsection .parainstructions,\"a\"\n" + " " + ".balign 8" + " " + "\n" + " " + ".quad" + " " + " 771b\n" + " .byte " + "%c[paravirt_typenum]" + "\n" + " .byte 772b-771b\n" + " .short " + "%c[paravirt_clobber]" + "\n" + ".popsection\n" + : "=D"(__edi), "=S"(__esi), "=d"(__edx), "=c"(__ecx), + "+r"(current_stack_pointer) + : [ paravirt_typenum ] "i"( + (__builtin_offsetof(struct paravirt_patch_template, cpu.cpuid) / + sizeof(void *))), + [ paravirt_opptr ] "i"(&(pv_ops.cpu.cpuid)), + [ paravirt_clobber ] "i"(((1 << 9) - 1)), 
"D"((unsigned long)(eax)), + "S"((unsigned long)(ebx)), "d"((unsigned long)(ecx)), + "c"((unsigned long)(edx)) + : "memory", "cc", "rax", "r8", "r9", "r10", "r11"); +} + +extern void check_init_int(int v); + +void test(unsigned int op) { + unsigned int eax, ebx, ecx, edx; + + eax = op; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + check_init_int(eax); + check_init_int(ebx); /* { dg-bogus "use of uninitialized value 'ebx'" } */ + check_init_int(ecx); + check_init_int(edx); /* { dg-bogus "use of uninitialized value 'edx'" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c new file mode 100644 index 0000000..c4b365f --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c @@ -0,0 +1,135 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). */ + +typedef __SIZE_TYPE__ size_t; + +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) +#define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +#define __ASM_FORM_RAW(x, ...) 
__stringify(x,##__VA_ARGS__) +#define __ASM_SEL(a,b) __ASM_FORM(b) +#define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) +#define _ASM_SP __ASM_REG(sp) + + +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) + +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +/* Adapted from Linux arch/x86/include/asm/paravirt.h */ + +struct pv_cpu_ops { + /* snip */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx); + /* snip */ +}; + +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(pv_ops.op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +#define CLBR_ANY ((1 << 9) - 1) + +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" + +#define PVOP_CALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) 
"c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +/* void functions are still allowed [re]ax for scratch */ +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) + +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" + +#define PVOP_TEST_NULL(op) ((void)pv_ops.op) + +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ + ({ \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ + }) + +#define __PVOP_VCALL(op, ...) \ + (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, ##__VA_ARGS__) + +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + +static void cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, + unsigned int *edx) +{ + PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); +} + +extern void check_init_int(int v); + +void test(unsigned int op) { + unsigned int eax, ebx, ecx, edx; + + eax = op; + ecx = 0; + cpuid(&eax, &ebx, &ecx, &edx); + + check_init_int(eax); + check_init_int(ebx); /* { dg-bogus "use of uninitialized value 'ebx'" } */ + check_init_int(ecx); + check_init_int(edx); /* { dg-bogus "use of uninitialized value 'edx'" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c new file mode 100644 index 0000000..243931a --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c @@ -0,0 +1,46 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +#include "../analyzer-decls.h" + +typedef unsigned __INT32_TYPE__ u32; +typedef unsigned __INT64_TYPE__ u64; + +extern void check_init_u32 (u32 v); +extern void 
check_init_u64 (u32 v); + +/* Adapted from linux kernel: arch/x86/include/asm/processor.h (GPL-2.0). */ + +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + native_cpuid(eax, ebx, ecx, edx); +} + +void test_1 (void) +{ + u32 eax, ebx, ecx, edx; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); /* from "amd_get_topology". */ + + /* Verify that they are now initialized. */ + check_init_u32 (eax); + check_init_u32 (ebx); + check_init_u32 (ecx); + check_init_u32 (edx); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c new file mode 100644 index 0000000..d994787 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c @@ -0,0 +1,210 @@ +/* Adapted from Linux: arch/x86/include/asm/paravirt.h */ + +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +/* Adapted/reduced from linux kernel (GPL-2.0). */ + +#include "../analyzer-decls.h" + +typedef unsigned char u8; +typedef unsigned __INT32_TYPE__ u32; +typedef unsigned __INT64_TYPE__ u64; +typedef __SIZE_TYPE__ size_t; + +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +# define __ASM_FORM_RAW(x, ...) 
__stringify(x,##__VA_ARGS__) + +#ifndef __x86_64__ +/* 32 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) +#else +/* 64 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +#endif + +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_SP __ASM_REG(sp) + + +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) + +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +/* Adapted from Linux arch/x86/include/asm/paravirt.h */ + + +/* snip */ + +/* ./arch/x86/include/asm/paravirt.h I think; was: + PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); + +*/ + +#ifndef __x86_64__ +#define CLBR_ANY ((1 << 4) - 1) +#else +#define CLBR_ANY ((1 << 9) - 1) +#endif /* X86_64 */ + +struct pv_cpu_ops { + /* snip */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + /* snip */ +}; + +struct paravirt_patch_template { + struct pv_cpu_ops cpu; + /* snip */ +}; +extern struct paravirt_patch_template pv_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(pv_ops.op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. 
*/ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" + +#ifndef __x86_64__ + +/* 32-bit. */ + +#define PVOP_CALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS + +#else + +/* 64-bit. */ + +/* [re]ax isn't an arg, but the return val */ +#define PVOP_CALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx, __eax = __eax; + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#define PVOP_TEST_NULL(op) ((void)pv_ops.op) + +#define PVOP_RETVAL(rettype) \ + ({ unsigned long __mask = ~0UL; \ + switch (sizeof(rettype)) { \ + case 1: __mask = 0xffUL; break; \ + case 2: __mask = 0xffffUL; break; \ + case 4: __mask = 0xffffffffUL; break; \ + default: break; \ + } \ + __mask & __eax; \ + }) + +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) 
\ + ({ \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ + }) + +#define __PVOP_CALL(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) + +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) +{ + return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); +} + +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ +}) + + +void check_init_int(int); +void check_init_u32(u32); + +void test(void) +{ + int err; + u32 eax, edx; + err = rdmsr_safe(0, &eax, &edx); + check_init_int(err); + check_init_u32(eax); + check_init_u32(edx); +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c new file mode 100644 index 0000000..0a1c48f --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c @@ -0,0 +1,33 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } { "" } } */ + +#include "../analyzer-decls.h" + +/* Adapted from Linux: arch/x86/include/asm/msr.h (GPL-2.0) */ + +#ifdef __x86_64__ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) +#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +#define DECLARE_ARGS(val, low, high) unsigned long long val +#define EAX_EDX_VAL(val, low, high) (val) +#define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + +static unsigned long long __rdmsr(unsigned int msr) +{ + DECLARE_ARGS(val, low, high); + + asm 
volatile("1: rdmsr\n" + "2:\n" + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + return EAX_EDX_VAL(val, low, high); +} + +void test (void) +{ + __analyzer_eval (__rdmsr (0)); /* { dg-warning "UNKNOWN" } */ + __analyzer_eval (__rdmsr (1)); /* { dg-warning "UNKNOWN" } */ +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c new file mode 100644 index 0000000..e90dccf5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c @@ -0,0 +1,319 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-additional-options "-fsanitize=bounds -fno-analyzer-call-summaries" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +/* Reduced from linux kernel: drivers/staging/wfx/sta.c (GPL-2.0) + on x86_64 with "allyesconfig" + + This test is deliberately not fully reduced, as an integration test + that the analyzer doesn't emit bogus "dereference of NULL" warnings + on the repeated wdev_to_wvif calls. */ + +#define NULL ((void *)0) + +/* Types. 
*/ + +typedef unsigned char __u8; +typedef unsigned short __u16; +__extension__ typedef unsigned long long __u64; + +typedef __u8 u8; +typedef __u16 u16; +typedef __u64 u64; + +enum { false = 0, true = 1 }; +typedef _Bool bool; + +struct device; + +typedef struct { + int counter; +} atomic_t; + +struct static_key { + atomic_t enabled; + union { + unsigned long type; + struct jump_entry *entries; + struct static_key_mod *next; + }; +}; + +struct static_key_true { + struct static_key key; +}; + +struct static_key_false { + struct static_key key; +}; + +struct _ddebug { + const char *modname; + const char *function; + const char *filename; + const char *format; + unsigned int lineno : 18; + unsigned int flags : 8; + + union { + struct static_key_true dd_key_true; + struct static_key_false dd_key_false; + } key; + +} __attribute__((aligned(8))); + +enum nl80211_iftype { + /* [...snip...] */ + NL80211_IFTYPE_AP, + /* [...snip...] */ + NUM_NL80211_IFTYPES, + NL80211_IFTYPE_MAX = NUM_NL80211_IFTYPES - 1 +}; + +struct ieee80211_channel { + /* [...snip...] */ + u16 hw_value; + /* [...snip...] */ +}; + +struct cfg80211_chan_def { + struct ieee80211_channel *chan; + /* [...snip...] */ +}; + +struct ieee80211_bss_conf { + /* [...snip...] */ + bool assoc, ibss_joined; + /* [...snip...] */ + struct cfg80211_chan_def chandef; + /* [...snip...] */ + bool ps; + /* [...snip...] */ +}; + +struct ieee80211_conf { + /* [...snip...] */ + int power_level, dynamic_ps_timeout; + /* [...snip...] */ +}; + +struct ieee80211_vif { + enum nl80211_iftype type; + struct ieee80211_bss_conf bss_conf; + /* [...snip...] */ + u8 drv_priv[] __attribute__((__aligned__(sizeof(void *)))); +}; + +struct ieee80211_hw { + struct ieee80211_conf conf; + /* [...snip...] */ +}; + +struct wfx_dev { + /* [...snip...] */ + struct device *dev; + struct ieee80211_hw *hw; + struct ieee80211_vif *vif[2]; + /* [...snip...] 
*/ + int force_ps_timeout; +}; + +struct wfx_vif { + struct wfx_dev *wdev; + struct ieee80211_vif *vif; + /* [...snip...] */ +}; + +/* Function decls. */ + +extern __attribute__((__format__(printf, 1, 2))) void +__warn_printk(const char *fmt, ...); + +extern bool ____wrong_branch_error(void); + +extern __attribute__((__format__(printf, 3, 4))) void +__dynamic_dev_dbg(struct _ddebug *descriptor, const struct device *dev, + const char *fmt, ...); + +bool wfx_api_older_than(struct wfx_dev *wdev, int major, int minor); + +/* Function defns. */ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) { + unsigned long mask; + + asm volatile("cmp %1,%2; sbb %0,%0;" + : "=r"(mask) + : "g"(size), "r"(index) + : "cc"); + return mask; +} + +static inline __attribute__((__always_inline__)) bool +arch_static_branch(struct static_key *key, bool branch) { + asm goto("1:" + "jmp %l[l_yes] # objtool NOPs this \n\t" + ".pushsection __jump_table, \"aw\" \n\t" + " " + ".balign 8" + " " + "\n\t" + ".long 1b - . \n\t" + ".long %l[l_yes] - . \n\t" + " " + ".quad" + " " + "%c0 + %c1 - .\n\t" + ".popsection \n\t" + : + : "i"(key), "i"(2 | branch) + : + : l_yes); + asm(""); + + return false; +l_yes: + return true; +} + +static inline __attribute__((__always_inline__)) bool +arch_static_branch_jump(struct static_key *const key, const bool branch) { + asm goto("1:" + "jmp %l[l_yes]\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + " " + ".balign 8" + " " + "\n\t" + ".long 1b - . \n\t" + ".long %l[l_yes] - . 
\n\t" + " " + ".quad" + " " + "%c0 + %c1 - .\n\t" + ".popsection \n\t" + : + : "i"(key), "i"(branch) + : + : l_yes); + asm(""); + + return false; +l_yes: + return true; +} + +static inline struct wfx_vif *wdev_to_wvif(struct wfx_dev *wdev, int vif_id) { + if (vif_id >= + (sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))) { + static struct _ddebug __attribute__((__aligned__(8))) + __attribute__((__section__("__dyndbg"))) __UNIQUE_ID_ddebug1678 = { + .modname = "wfx", + .function = __func__, + .filename = "drivers/staging/wfx/wfx.h", + .format = ("requesting non-existent vif: %d\n"), + .lineno = 97, + .flags = 0, + .key.dd_key_false = ((struct static_key_false){ + .key = {.enabled = {0}, {.entries = (void *)0UL}}, + })}; + if (({ + bool branch; + if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1678.key.dd_key_false), + struct static_key_true)) + branch = arch_static_branch_jump( + &(&__UNIQUE_ID_ddebug1678.key.dd_key_false)->key, false); + else if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1678.key.dd_key_false), + struct static_key_false)) + branch = arch_static_branch( + &(&__UNIQUE_ID_ddebug1678.key.dd_key_false)->key, false); + else + branch = ____wrong_branch_error(); + __builtin_expect(!!(branch), 0); + })) + __dynamic_dev_dbg(&__UNIQUE_ID_ddebug1678, wdev->dev, + "requesting non-existent vif: %d\n", vif_id); + return NULL; + } + typeof(vif_id) _i = (vif_id); + typeof((sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))) _s = + ((sizeof(wdev->vif) / sizeof((wdev->vif)[0]) + ((int)(sizeof(struct { + int : (-!!(__builtin_types_compatible_p(typeof((wdev->vif)), + typeof(&(wdev->vif)[0])))); + }))))); + unsigned long _mask = array_index_mask_nospec(_i, _s); + vif_id = (typeof(_i))(_i & _mask); + if 
(!wdev->vif[vif_id]) { + static struct _ddebug __attribute__((__aligned__(8))) + __attribute__((__section__("__dyndbg"))) __UNIQUE_ID_ddebug1681 = { + .modname = "wfx", + .function = __func__, + .filename = "drivers/staging/wfx/wfx.h", + .format = ("requesting non-allocated vif: %d\n"), + .lineno = 102, + .flags = 0, + .key.dd_key_false = ((struct static_key_false){ + .key = {.enabled = {0}, {.entries = (void *)0UL}}, + })}; + if (({ + bool branch; + if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1681.key.dd_key_false), + struct static_key_true)) + branch = arch_static_branch_jump( + &(&__UNIQUE_ID_ddebug1681.key.dd_key_false)->key, false); + else if (__builtin_types_compatible_p( + typeof(*&__UNIQUE_ID_ddebug1681.key.dd_key_false), + struct static_key_false)) + branch = arch_static_branch( + &(&__UNIQUE_ID_ddebug1681.key.dd_key_false)->key, false); + else + branch = ____wrong_branch_error(); + __builtin_expect(!!(branch), 0); + })) + __dynamic_dev_dbg(&__UNIQUE_ID_ddebug1681, wdev->dev, + "requesting non-allocated vif: %d\n", vif_id); + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; +} + +int wfx_get_ps_timeout(struct wfx_vif *wvif, bool *enable_ps) { + struct ieee80211_channel *chan0 = NULL, *chan1 = NULL; + struct ieee80211_conf *conf = &wvif->wdev->hw->conf; + + if (wdev_to_wvif(wvif->wdev, 0)) + chan0 = wdev_to_wvif(wvif->wdev, 0)->vif->bss_conf.chandef.chan; /* { dg-bogus "dereference of NULL" } */ + if (wdev_to_wvif(wvif->wdev, 1)) + chan1 = wdev_to_wvif(wvif->wdev, 1)->vif->bss_conf.chandef.chan; /* { dg-bogus "dereference of NULL" } */ + if (chan0 && chan1 && chan0->hw_value != chan1->hw_value && + wvif->vif->type != NL80211_IFTYPE_AP) { + + if (enable_ps) + *enable_ps = true; + if (wvif->wdev->force_ps_timeout > -1) + return wvif->wdev->force_ps_timeout; + else if (wfx_api_older_than(wvif->wdev, 3, 2)) + return 0; + else + return 30; + } + if (enable_ps) + *enable_ps = wvif->vif->bss_conf.ps; + if 
(wvif->wdev->force_ps_timeout > -1) + return wvif->wdev->force_ps_timeout; + else if (wvif->vif->bss_conf.assoc && wvif->vif->bss_conf.ps) + return conf->dynamic_ps_timeout; + else + return -1; +} diff --git a/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c new file mode 100644 index 0000000..a18c58c --- /dev/null +++ b/gcc/testsuite/gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c @@ -0,0 +1,77 @@ +/* { dg-do assemble { target x86_64-*-* } } */ +/* { dg-require-effective-target lp64 } */ + +/* Reproducer for false positive from -Wanalyzer-null-dereference seen + in Linux kernel (drivers/staging/wfx/sta.c; GPL-2.0) due to + the analyzer not grokking that array_index_mask_nospec is + effectively pure, and thus not realizing that array_index_no_spec + is also pure, leading to wdev_to_wvif not being treated as pure, + and thus able to return non-NULL and then NULL. */ + +typedef unsigned char u8; +#define NULL ((void *)0) + +/* Types. 
*/ + +struct ieee80211_vif { + int placeholder; + /* snip */ + u8 drv_priv[]; +}; + +struct wfx_dev { + /* snip */ + struct ieee80211_vif *vif[2]; + /* snip */ +}; + +struct wfx_vif { + struct wfx_dev *wdev; + struct ieee80211_vif *vif; + /* snip */ +}; + +/* Copied from arch/x86/include/asm/barrier.h */ + +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm volatile ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"g"(size),"r" (index) + :"cc"); + return mask; +} + +/* Simplified from include/linux/kernel.h */ +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +/* Simplified from include/linux/nospec.h */ + +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + /* snip */ \ + (typeof(_i)) (_i & _mask); \ +}) + +/* Simplified from drivers/staging/wfx/wfx.h */ + +static inline struct wfx_vif *wdev_to_wvif(struct wfx_dev *wdev, int vif_id) { + vif_id = array_index_nospec(vif_id, ARRAY_SIZE(wdev->vif)); + if (!wdev->vif[vif_id]) { + return NULL; + } + return (struct wfx_vif *)wdev->vif[vif_id]->drv_priv; +} + +struct ieee80211_vif *test (struct wfx_vif *wvif) { + if (wdev_to_wvif(wvif->wdev, 1)) + return wdev_to_wvif(wvif->wdev, 1)->vif; /* { dg-bogus "dereference of NULL" } */ + else + return NULL; +} -- cgit v1.1 From 2697f8324fbb09b0d92036ba6a6b8a2b8d256b23 Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Thu, 5 Aug 2021 00:17:03 +0000 Subject: Daily bump. 
--- gcc/ChangeLog | 204 ++++++++++++++++++++++++++++++++++++++++++++++++ gcc/DATESTAMP | 2 +- gcc/analyzer/ChangeLog | 36 +++++++++ gcc/cp/ChangeLog | 6 ++ gcc/testsuite/ChangeLog | 120 ++++++++++++++++++++++++++++ 5 files changed, 367 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 04757ba..d888dc5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,207 @@ +2021-08-04 David Malcolm + + PR analyzer/101570 + * Makefile.in (ANALYZER_OBJS): Add analyzer/region-model-asm.o. + +2021-08-04 H.J. Lu + + PR target/101742 + * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes + only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. + +2021-08-04 H.J. Lu + + PR target/101772 + * config/i386/i386-expand.c (ix86_expand_vector_move): Call + ix86_gen_scratch_sse_rtx to get a scratch SSE register to copy + data with SSE register from one memory location to another. + +2021-08-04 Andreas Krebbel + + * config/s390/s390.c (expand_perm_with_vpdi): New function. + (vectorize_vec_perm_const_1): Call expand_perm_with_vpdi. + * config/s390/vector.md (*vpdi1, @vpdi1): Enable a + parameterized expander. + (*vpdi4, @vpdi4): Likewise. + +2021-08-04 Andreas Krebbel + + * config/s390/s390.c (MAX_VECT_LEN): Define macro. + (struct expand_vec_perm_d): Define struct. + (expand_perm_with_merge): New function. + (vectorize_vec_perm_const_1): New function. + (s390_vectorize_vec_perm_const): New function. + (TARGET_VECTORIZE_VEC_PERM_CONST): Define target macro. + +2021-08-04 Andreas Krebbel + + * config/s390/vector.md (V_HW_64): Remove mode iterator. + (*vec_load_pair): Use V_HW_2 instead of V_HW_64. + * config/s390/vx-builtins.md + (vec_scatter_element_SI): Use V_HW_2 instead of + V_HW_64. + +2021-08-04 Andreas Krebbel + + * config/s390/s390.md (UNSPEC_VEC_PERMI): Remove constant + definition. + * config/s390/vector.md (*vpdi1, *vpdi4): New pattern + definitions. 
+ * config/s390/vx-builtins.md (*vec_permi): Emit generic rtx + instead of an unspec. + +2021-08-04 Andreas Krebbel + + * config/s390/s390-modes.def: Add more vector modes to support + concatenation of two vectors. + * config/s390/s390-protos.h (s390_expand_merge_perm_const): Add + prototype. + (s390_expand_merge): Likewise. + * config/s390/s390.c (s390_expand_merge_perm_const): New function. + (s390_expand_merge): New function. + * config/s390/s390.md (UNSPEC_VEC_MERGEH, UNSPEC_VEC_MERGEL): + Remove constant definitions. + * config/s390/vector.md (V_HW_2): Add mode iterators. + (VI_HW_4, V_HW_4): Rename VI_HW_4 to V_HW_4. + (vec_2x_nelts, vec_2x_wide): New mode attributes. + (*vmrhb, *vmrlb, *vmrhh, *vmrlh, *vmrhf, *vmrlf, *vmrhg, *vmrlg): + New pattern definitions. + (vec_widen_umult_lo_, vec_widen_umult_hi_) + (vec_widen_smult_lo_, vec_widen_smult_hi_) + (vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf, vec_unpacks_lo_v2df) + (vec_unpacks_hi_v2df): Adjust expanders to emit non-unspec RTX for + vec merge. + * config/s390/vx-builtins.md (V_HW_4): Remove mode iterator. Now + in vector.md. + (vec_mergeh, vec_mergel): Use s390_expand_merge to + emit vec merge pattern. + +2021-08-04 Jonathan Wright + + * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half): + Define. + (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of + vec_select high-half from being added into Neon multiply + cost. + * rtlanal.c (vec_series_highpart_p): Define. + * rtlanal.h (vec_series_highpart_p): Declare. + +2021-08-04 Jonathan Wright + + * config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt): + Define. + (aarch64_rtx_mult_cost): Traverse RTL tree to prevent + vec_select cost from being added into Neon multiply cost. + +2021-08-04 Richard Sandiford + + * tree-vect-loop.c (vect_better_loop_vinfo_p): Detect cases in + which old_loop_vinfo is an epilogue loop that handles a constant + number of iterations. 
+ +2021-08-04 Richard Sandiford + + * tree-vect-loop.c (vect_analyze_loop): Print a dump message + when a reanalyzed loop fails to be cheaper than the current + main loop. + +2021-08-04 Richard Sandiford + + * config/aarch64/aarch64.c: Fix a typo. + +2021-08-04 Vincent Lefèvre + + PR gcov-profile/101773 + * gcov-io.c (gcov_close): Check return code of a fclose. + +2021-08-04 Bernd Edlinger + + PR ada/101575 + * dwarf2out.c (dwarf2out_assembly_start): Emit a dummy + .file statement when needed. + +2021-08-04 Richard Biener + + * tree-vect-data-refs.c (vect_check_gather_scatter): + Include widening conversions only when the result is + still handed by native gather or the current offset + size not already matches the data size. + Also succeed analysis in case there's no native support, + noted by a IFN_LAST ifn and a NULL decl. + (vect_analyze_data_refs): Always consider gathers. + * tree-vect-patterns.c (vect_recog_gather_scatter_pattern): + Test for no IFN gather rather than decl gather. + * tree-vect-stmts.c (vect_model_load_cost): Pass in the + gather-scatter info and cost emulated gathers accordingly. + (vect_truncate_gather_scatter_offset): Properly test for + no IFN gather. + (vect_use_strided_gather_scatters_p): Likewise. + (get_load_store_type): Handle emulated gathers and its + restrictions. + (vectorizable_load): Likewise. Emulate them by extracting + scalar offsets, doing scalar loads and a vector construct. + +2021-08-04 H.J. Lu + + PR target/101742 + * expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces + argument to set m_max_size. + (move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d. + (store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d. + (compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d. 
+ +2021-08-04 Roger Sayle + Marc Glisse + + * match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and + (X*C1)^(X*C2) as X*(C1+C2), and related variants, using + tree_nonzero_bits to ensure that operands are bit-wise disjoint. + +2021-08-04 Richard Biener + + * tree-ssa-forwprop.c (pass_forwprop::execute): Split + out code to decompose vector loads ... + (optimize_vector_load): ... here. Generalize it to + handle intermediate widening and TARGET_MEM_REF loads + and apply it to loads with a supported vector mode as well. + +2021-08-04 Richard Biener + + PR tree-optimization/101756 + * tree-vect-slp.c (vectorizable_bb_reduc_epilogue): Make sure + the result of the reduction epilogue is compatible to the original + scalar result. + +2021-08-04 liuhongt + + PR target/101743 + * config/i386/i386.md (peephole2): Refine predicate from + register_operand to general_reg_operand. + +2021-08-04 Aldy Hernandez + + * gimple-range-path.h (path_range_query::dump): Mark override. + +2021-08-04 Richard Biener + + PR tree-optimization/101769 + * tree-tailcall.c (eliminate_tail_call): Add the created loop + for the first recursion and return it via the new output parameter. + (optimize_tail_call): Pass through new output param. + (tree_optimize_tail_calls_1): After creating all latches, + add the created loop to the loop tree. Do not mark loops for fixup. + +2021-08-04 Martin Liska + + * doc/invoke.texi: Document threader-mode param. + +2021-08-04 liuhongt + + * config/i386/sse.md (cond_fma): New expander. + (cond_fms): Ditto. + (cond_fnma): Ditto. + (cond_fnms): Ditto. + 2021-08-03 Segher Boessenkool * config/rs6000/vsx.md (*vsx_le_perm_store_): Use && instead of &. 
diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 856144c..6168f46 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20210804 +20210805 diff --git a/gcc/analyzer/ChangeLog b/gcc/analyzer/ChangeLog index 4579796..43e3c63 100644 --- a/gcc/analyzer/ChangeLog +++ b/gcc/analyzer/ChangeLog @@ -1,3 +1,39 @@ +2021-08-04 David Malcolm + + PR analyzer/101570 + * analyzer.cc (maybe_reconstruct_from_def_stmt): Add GIMPLE_ASM + case. + * analyzer.h (class asm_output_svalue): New forward decl. + (class reachable_regions): New forward decl. + * complexity.cc (complexity::from_vec_svalue): New. + * complexity.h (complexity::from_vec_svalue): New decl. + * engine.cc (feasibility_state::maybe_update_for_edge): Handle + asm stmts by calling on_asm_stmt. + * region-model-asm.cc: New file. + * region-model-manager.cc + (region_model_manager::maybe_fold_asm_output_svalue): New. + (region_model_manager::get_or_create_asm_output_svalue): New. + (region_model_manager::log_stats): Log m_asm_output_values_map. + * region-model.cc (region_model::on_stmt_pre): Handle GIMPLE_ASM. + * region-model.h (visitor::visit_asm_output_svalue): New. + (region_model_manager::get_or_create_asm_output_svalue): New decl. + (region_model_manager::maybe_fold_asm_output_svalue): New decl. + (region_model_manager::asm_output_values_map_t): New typedef. + (region_model_manager::m_asm_output_values_map): New field. + (region_model::on_asm_stmt): New. + * store.cc (binding_cluster::on_asm): New. + * store.h (binding_cluster::on_asm): New decl. + * svalue.cc (svalue::cmp_ptr): Handle SK_ASM_OUTPUT. + (asm_output_svalue::dump_to_pp): New. + (asm_output_svalue::dump_input): New. + (asm_output_svalue::input_idx_to_asm_idx): New. + (asm_output_svalue::accept): New. + * svalue.h (enum svalue_kind): Add SK_ASM_OUTPUT. + (svalue::dyn_cast_asm_output_svalue): New. + (class asm_output_svalue): New. + (is_a_helper ::test): New. + (struct default_hash_traits): New. 
+ 2021-08-03 Jakub Jelinek PR analyzer/101721 diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 5468792..5b3e191 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,9 @@ +2021-08-04 Jakub Jelinek + + PR c++/101759 + * parser.c (cp_parser_default_argument): Temporarily override + parser->omp_declare_simd and parser->oacc_routine to NULL. + 2021-08-02 Patrick Palka PR c++/100828 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 097a5b5..04b011b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,123 @@ +2021-08-04 David Malcolm + + PR analyzer/101570 + * gcc.dg/analyzer/asm-x86-1.c: New test. + * gcc.dg/analyzer/asm-x86-lp64-1.c: New test. + * gcc.dg/analyzer/asm-x86-lp64-2.c: New test. + * gcc.dg/analyzer/pr101570.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-array_index_mask_nospec.c: + New test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-1.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid-paravirt-2.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-cpuid.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr-paravirt.c: New + test. + * gcc.dg/analyzer/torture/asm-x86-linux-rdmsr.c: New test. + * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-full.c: + New test. + * gcc.dg/analyzer/torture/asm-x86-linux-wfx_get_ps_timeout-reduced.c: + New test. + +2021-08-04 H.J. Lu + + PR target/101742 + * gcc.target/i386/pr101742a.c: New test. + * gcc.target/i386/pr101742b.c: Likewise. + +2021-08-04 H.J. Lu + + PR target/101772 + * gcc.target/i386/eh_return-2.c: New test. + +2021-08-04 Andreas Krebbel + + * gcc.target/s390/vector/perm-vpdi.c: New test. + +2021-08-04 Andreas Krebbel + + * gcc.target/s390/vector/perm-merge.c: New test. + * gcc.target/s390/vector/vec-types.h: New test. + +2021-08-04 Andreas Krebbel + + * gcc.target/s390/zvector/vec-permi.c: Removed. + * gcc.target/s390/zvector/vec_permi.c: New test. 
+ +2021-08-04 Andreas Krebbel + + * gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c: + Instead of vpdi with 0 and 5 vmrlg and vmrhg are used now. + * gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: Likewise. + * gcc.target/s390/zvector/vec-types.h: New test. + * gcc.target/s390/zvector/vec_merge.c: New test. + +2021-08-04 Jonathan Wright + + * gcc.target/aarch64/vmul_high_cost.c: New test. + +2021-08-04 Jonathan Wright + + * gcc.target/aarch64/vmul_element_cost.c: New test. + +2021-08-04 Richard Sandiford + + * gcc.target/aarch64/sve/cost_model_12.c: New test. + +2021-08-04 Tamar Christina + + PR tree-optimization/101750 + * g++.dg/vect/pr99149.cc: Name class. + +2021-08-04 Richard Biener + + * gcc.target/i386/vect-gather-1.c: New testcase. + * gfortran.dg/vect/vect-8.f90: Adjust. + +2021-08-04 Roger Sayle + Marc Glisse + + * gcc.dg/fold-ior-4.c: New test. + +2021-08-04 Richard Biener + + PR tree-optimization/101756 + * gcc.dg/vect/bb-slp-pr101756.c: New testcase. + +2021-08-04 Jakub Jelinek + + PR c++/101759 + * g++.dg/gomp/pr101759.C: New test. + * g++.dg/goacc/pr101759.C: New test. + +2021-08-04 Jakub Jelinek + + * gcc.c-torture/execute/ieee/pr29302-1.x: Undo doubly applied patch. + +2021-08-04 Richard Biener + + PR tree-optimization/101769 + * g++.dg/tree-ssa/pr101769.C: New testcase. + +2021-08-04 liuhongt + + * gcc.target/i386/cond_op_addsubmul_d-2.c: Add + dg-require-effective-target for avx512. + * gcc.target/i386/cond_op_addsubmul_q-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmul_w-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmuldiv_double-2.c: Ditto. + * gcc.target/i386/cond_op_addsubmuldiv_float-2.c: Ditto. + * gcc.target/i386/cond_op_fma_double-2.c: Ditto. + * gcc.target/i386/cond_op_fma_float-2.c: Ditto. + +2021-08-04 liuhongt + + * gcc.target/i386/cond_op_fma_double-1.c: New test. + * gcc.target/i386/cond_op_fma_double-2.c: New test. + * gcc.target/i386/cond_op_fma_float-1.c: New test. 
+ * gcc.target/i386/cond_op_fma_float-2.c: New test. + 2021-08-03 Eugene Rozenfeld * lib/profopt.exp: Pass gdwarf-4 when compiling test to profile; pass -gcov_version=2. -- cgit v1.1 From 9a8c3fc2b2cc6d73b2e3006625fca2b588ebc1b0 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 16:03:58 +0800 Subject: Support cond_{smax,smin,umax,umin} for vector integer modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_maxmin_b-1.c: New test. * gcc.target/i386/cond_op_maxmin_b-2.c: New test. * gcc.target/i386/cond_op_maxmin_d-1.c: New test. * gcc.target/i386/cond_op_maxmin_d-2.c: New test. * gcc.target/i386/cond_op_maxmin_q-1.c: New test. * gcc.target/i386/cond_op_maxmin_q-2.c: New test. * gcc.target/i386/cond_op_maxmin_ub-1.c: New test. * gcc.target/i386/cond_op_maxmin_ub-2.c: New test. * gcc.target/i386/cond_op_maxmin_ud-1.c: New test. * gcc.target/i386/cond_op_maxmin_ud-2.c: New test. * gcc.target/i386/cond_op_maxmin_uq-1.c: New test. * gcc.target/i386/cond_op_maxmin_uq-2.c: New test. * gcc.target/i386/cond_op_maxmin_uw-1.c: New test. * gcc.target/i386/cond_op_maxmin_uw-2.c: New test. * gcc.target/i386/cond_op_maxmin_w-1.c: New test. * gcc.target/i386/cond_op_maxmin_w-2.c: New test. 
--- gcc/config/i386/sse.md | 18 ++++++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c | 8 +++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c | 6 ++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c | 41 +++++++++++++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c | 67 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c | 8 +++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c | 5 ++ .../gcc.target/i386/cond_op_maxmin_ub-1.c | 8 +++ .../gcc.target/i386/cond_op_maxmin_ub-2.c | 6 ++ .../gcc.target/i386/cond_op_maxmin_ud-1.c | 8 +++ .../gcc.target/i386/cond_op_maxmin_ud-2.c | 5 ++ .../gcc.target/i386/cond_op_maxmin_uq-1.c | 8 +++ .../gcc.target/i386/cond_op_maxmin_uq-2.c | 5 ++ .../gcc.target/i386/cond_op_maxmin_uw-1.c | 8 +++ .../gcc.target/i386/cond_op_maxmin_uw-2.c | 6 ++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c | 8 +++ gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c | 6 ++ 17 files changed, 221 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c create mode 100644 
gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c (limited to 'gcc') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f5968e0..6035411 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13070,6 +13070,24 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "cond_" + [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") + (vec_merge:VI1248_AVX512VLBW + (maxmin:VI1248_AVX512VLBW + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand") + (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand")) + (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_expand "3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c new file mode 100644 index 0000000..78c6600 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int8 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsb" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsb" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c new file mode 100644 index 0000000..8ba7a3f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_b-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=int8" } */ +/* { 
dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c new file mode 100644 index 0000000..2543d36 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-1.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsd" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsd" 1 } } */ + +typedef char int8; +typedef unsigned char uint8; +typedef short int16; +typedef unsigned short uint16; +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE int +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) < (Y) ? 
(Y) : (X)) + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = OP(d[i], e[i]); \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (max, MAX); +BIN (min, MIN); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c new file mode 100644 index 0000000..f715f54 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_d-2.c @@ -0,0 +1,67 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_maxmin_d-1.c" +#define BINO2(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = OP(d[i], e[i]); \ + else \ + j[i] = d[i] - e[i]; \ + } + +BINO2 (max, MAX); +BINO2 (min, MIN); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_max (); + foo_o2_max (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_min (); + foo_o2_min (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c new file mode 100644 index 0000000..a1925c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ 
+/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsq" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsq" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c new file mode 100644 index 0000000..205a65a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_q-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=int64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c new file mode 100644 index 0000000..117179f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint8 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxub" 1 } } */ +/* { dg-final { scan-assembler-times "vpminub" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c new file mode 100644 index 0000000..ac4a206 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ub-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=uint8" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c new file mode 100644 index 0000000..1ce0f82 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint32 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxud" 1 } } */ +/* { dg-final { scan-assembler-times "vpminud" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c new file mode 100644 index 0000000..d609ef0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_ud-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint32" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c new file mode 100644 index 0000000..82209f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxuq" 1 } } */ +/* { dg-final { scan-assembler-times "vpminuq" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c new file mode 100644 index 0000000..c2053c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uq-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=uint64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_d-2.c" diff --git 
a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c new file mode 100644 index 0000000..43d560d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=uint16 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxuw" 1 } } */ +/* { dg-final { scan-assembler-times "vpminuw" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c new file mode 100644 index 0000000..463fc52 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_uw-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=uint16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c new file mode 100644 index 0000000..d4d388e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int16 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpmaxsw" 1 } } */ +/* { dg-final { scan-assembler-times "vpminsw" 1 } } */ + +#include "cond_op_maxmin_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c new file mode 100644 index 0000000..d6e45e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_w-2.c @@ -0,0 +1,6 @@ 
+/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mavx512bw -mprefer-vector-width=256 -DTYPE=int16" } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512bw } */ + +#include "cond_op_maxmin_d-2.c" -- cgit v1.1 From f7aa81892eb54bc040ee6f7fd6134d800a5ee89c Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 18:15:43 +0800 Subject: Support cond_{smax,smin} for vector float/double modes under AVX512. gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_maxmin_double-1.c: New test. * gcc.target/i386/cond_op_maxmin_double-2.c: New test. * gcc.target/i386/cond_op_maxmin_float-1.c: New test. * gcc.target/i386/cond_op_maxmin_float-2.c: New test. --- gcc/config/i386/sse.md | 18 ++++++ .../gcc.target/i386/cond_op_maxmin_double-1.c | 39 +++++++++++++ .../gcc.target/i386/cond_op_maxmin_double-2.c | 67 ++++++++++++++++++++++ .../gcc.target/i386/cond_op_maxmin_float-1.c | 8 +++ .../gcc.target/i386/cond_op_maxmin_float-2.c | 5 ++ 5 files changed, 137 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c (limited to 'gcc') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6035411..51733a3 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2376,6 +2376,24 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_expand "cond_" + [(set (match_operand:VF 0 "register_operand") + (vec_merge:VF + (smaxmin:VF + (match_operand:VF 2 "vector_operand") + (match_operand:VF 3 "vector_operand")) + (match_operand:VF 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + " == 64 || TARGET_AVX512VL" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + 
operands[4], + operands[1])); + DONE; +}) + (define_expand "3" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c new file mode 100644 index 0000000..eda8e19 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-1.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vmaxpd" 1 } } */ +/* { dg-final { scan-assembler-times "vminpd" 1 } } */ + +#include +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE double +#endif +#ifndef FN_MAX +#define FN_MAX fmax +#endif +#ifndef FN_MIN +#define FN_MIN fmin +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; +#define MAX FN_MAX +#define MIN FN_MIN + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("Ofast"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = (OP (d[i], e[i])); \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (max, MAX); +BIN (min, MIN); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c new file mode 100644 index 0000000..c50a831 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_double-2.c @@ -0,0 +1,67 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -ffast-math" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_maxmin_double-1.c" +#define BINO2(OPNAME, OP) \ + void \ + __attribute__ ((noipa)) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = OP(d[i], e[i]); \ + else 
\ + j[i] = d[i] - e[i]; \ + } + +BINO2 (max, MAX); +BINO2 (min, MIN); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_max (); + foo_o2_max (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_min (); + foo_o2_min (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c new file mode 100644 index 0000000..2d2157d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=float -fdump-tree-optimized -DFN_MAX=fmaxf -DFN_MIN=fminf" } */ +/* { dg-final { scan-tree-dump ".COND_MAX" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_MIN" "optimized" } } */ +/* { dg-final { scan-assembler-times "vmaxps" 1 } } */ +/* { dg-final { scan-assembler-times "vminps" 1 } } */ + +#include "cond_op_maxmin_double-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c new file mode 100644 index 0000000..fec784e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_maxmin_float-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float -DFN_MAX=fmaxf -DFN_MIN=fminf" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_maxmin_double-2.c" -- cgit v1.1 From c16f21c7cf97ce48967e42d3b5d22ea169a9c2c8 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 4 Aug 2021 18:43:22 +0800 Subject: Support cond_{xor,ior,and} for vector integer mode under AVX512. 
gcc/ChangeLog: * config/i386/sse.md (cond_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/cond_op_anylogic_d-1.c: New test. * gcc.target/i386/cond_op_anylogic_d-2.c: New test. * gcc.target/i386/cond_op_anylogic_q-1.c: New test. * gcc.target/i386/cond_op_anylogic_q-2.c: New test. --- gcc/config/i386/sse.md | 18 +++++ .../gcc.target/i386/cond_op_anylogic_d-1.c | 38 +++++++++++ .../gcc.target/i386/cond_op_anylogic_d-2.c | 78 ++++++++++++++++++++++ .../gcc.target/i386/cond_op_anylogic_q-1.c | 10 +++ .../gcc.target/i386/cond_op_anylogic_q-2.c | 5 ++ 5 files changed, 149 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c (limited to 'gcc') diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 51733a3..a46a237 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14063,6 +14063,24 @@ DONE; }) +(define_expand "cond_" + [(set (match_operand:VI48_AVX512VL 0 "register_operand") + (vec_merge:VI48_AVX512VL + (any_logic:VI48_AVX512VL + (match_operand:VI48_AVX512VL 2 "vector_operand") + (match_operand:VI48_AVX512VL 3 "vector_operand")) + (match_operand:VI48_AVX512VL 4 "nonimm_or_0_operand") + (match_operand: 1 "register_operand")))] + "TARGET_AVX512F" +{ + emit_insn (gen_3_mask (operands[0], + operands[2], + operands[3], + operands[4], + operands[1])); + DONE; +}) + (define_insn "3" [(set (match_operand:VI48_AVX_AVX512F 0 "register_operand" "=x,x,v") (any_logic:VI48_AVX_AVX512F diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c new file mode 100644 index 0000000..8951f4a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 
-march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_AND" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_XOR" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_IOR" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpxord" 1 } } */ +/* { dg-final { scan-assembler-times "vpord" 1 } } */ +/* { dg-final { scan-assembler-times "vpandd" 1 } } */ + +typedef int int32; +typedef unsigned int uint32; +typedef long long int64; +typedef unsigned long long uint64; + +#ifndef NUM +#define NUM 800 +#endif +#ifndef TYPE +#define TYPE int +#endif + +TYPE a[NUM], b[NUM], c[NUM], d[NUM], e[NUM], j[NUM]; + +#define BIN(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O3"))) \ + foo_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + a[i] = d[i] OP e[i]; \ + else \ + a[i] = d[i] - e[i]; \ + } + +BIN (and, &); +BIN (ior, |); +BIN (xor, ^); diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c new file mode 100644 index 0000000..23ca412 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_d-2.c @@ -0,0 +1,78 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512vl } */ + +#define AVX512VL +#ifndef CHECK +#define CHECK "avx512f-helper.h" +#endif + +#include CHECK + +#include "cond_op_anylogic_d-1.c" +#define BINO2(OPNAME, OP) \ + void \ + __attribute__ ((noipa,optimize ("O2"))) \ + foo_o2_##OPNAME () \ + { \ + for (int i = 0; i != NUM; i++) \ + if (b[i] < c[i]) \ + j[i] = d[i] OP e[i]; \ + else \ + j[i] = d[i] - e[i]; \ + } + +BINO2 (and, &); +BINO2 (ior, |); +BINO2 (xor, ^); + +static void +test_256 (void) +{ + int sign = -1; + for (int i = 0; i != NUM; i++) + { + a[i] = 0; + d[i] = i * 2; + e[i] = i * i * 3 - i * 9 + 153; + b[i] = i * 83; + c[i] = b[i] + sign; + sign *= -1; + j[i] = 1; + } + foo_and (); + foo_o2_and (); + for (int i = 
0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + b[i] = 1; + } + + foo_xor (); + foo_o2_xor (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } + + foo_ior (); + foo_o2_ior (); + for (int i = 0; i != NUM; i++) + { + if (a[i] != j[i]) + abort (); + a[i] = 0; + j[i] = 1; + } +} + +static void +test_128 () +{ + +} diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c new file mode 100644 index 0000000..cb47701 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512 -DTYPE=int64 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump ".COND_AND" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_XOR" "optimized" } } */ +/* { dg-final { scan-tree-dump ".COND_IOR" "optimized" } } */ +/* { dg-final { scan-assembler-times "vpxorq" 1 } } */ +/* { dg-final { scan-assembler-times "vporq" 1 } } */ +/* { dg-final { scan-assembler-times "vpandq" 1 } } */ + +#include "cond_op_anylogic_d-1.c" diff --git a/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c new file mode 100644 index 0000000..709babf --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cond_op_anylogic_q-2.c @@ -0,0 +1,5 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=int64" } */ +/* { dg-require-effective-target avx512vl } */ + +#include "cond_op_anylogic_d-2.c" -- cgit v1.1 From ac8a2fbedf59eecda6d1c049952e10946ffc4a61 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Wed, 4 Aug 2021 18:24:47 -0400 Subject: compiler: make escape analysis more robust about builtin functions In the places where we handle builtin functions, list all supported ones, and fail if an unexpected one is seen. 
So if a new builtin function is added in the future we can detect it, instead of silently treating it as nonescaping. Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/339992 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/escape.cc | 56 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'gcc') diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index be1a90f..394530c 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -616ee658a6238e7de53592ebda5997f6de6a00de +b47bcf942daa9a0c252db9b57b8f138adbfcdaa2 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/escape.cc b/gcc/go/gofrontend/escape.cc index 347ac25..c8978ac 100644 --- a/gcc/go/gofrontend/escape.cc +++ b/gcc/go/gofrontend/escape.cc @@ -1608,8 +1608,33 @@ Escape_analysis_assign::expression(Expression** pexpr) } break; - default: + case Builtin_call_expression::BUILTIN_CLOSE: + case Builtin_call_expression::BUILTIN_DELETE: + case Builtin_call_expression::BUILTIN_PRINT: + case Builtin_call_expression::BUILTIN_PRINTLN: + case Builtin_call_expression::BUILTIN_LEN: + case Builtin_call_expression::BUILTIN_CAP: + case Builtin_call_expression::BUILTIN_COMPLEX: + case Builtin_call_expression::BUILTIN_REAL: + case Builtin_call_expression::BUILTIN_IMAG: + case Builtin_call_expression::BUILTIN_RECOVER: + case Builtin_call_expression::BUILTIN_ALIGNOF: + case Builtin_call_expression::BUILTIN_OFFSETOF: + case Builtin_call_expression::BUILTIN_SIZEOF: + // these do not escape. + break; + + case Builtin_call_expression::BUILTIN_ADD: + case Builtin_call_expression::BUILTIN_SLICE: + // handled in ::assign. break; + + case Builtin_call_expression::BUILTIN_MAKE: + case Builtin_call_expression::BUILTIN_NEW: + // should have been lowered to runtime calls at this point. 
+ // fallthrough + default: + go_unreachable(); } break; } @@ -2372,8 +2397,35 @@ Escape_analysis_assign::assign(Node* dst, Node* src) } break; - default: + case Builtin_call_expression::BUILTIN_LEN: + case Builtin_call_expression::BUILTIN_CAP: + case Builtin_call_expression::BUILTIN_COMPLEX: + case Builtin_call_expression::BUILTIN_REAL: + case Builtin_call_expression::BUILTIN_IMAG: + case Builtin_call_expression::BUILTIN_RECOVER: + case Builtin_call_expression::BUILTIN_ALIGNOF: + case Builtin_call_expression::BUILTIN_OFFSETOF: + case Builtin_call_expression::BUILTIN_SIZEOF: + // these do not escape. + break; + + case Builtin_call_expression::BUILTIN_COPY: + // handled in ::expression. break; + + case Builtin_call_expression::BUILTIN_CLOSE: + case Builtin_call_expression::BUILTIN_DELETE: + case Builtin_call_expression::BUILTIN_PRINT: + case Builtin_call_expression::BUILTIN_PRINTLN: + case Builtin_call_expression::BUILTIN_PANIC: + // these do not have result. + // fallthrough + case Builtin_call_expression::BUILTIN_MAKE: + case Builtin_call_expression::BUILTIN_NEW: + // should have been lowered to runtime calls at this point. + // fallthrough + default: + go_unreachable(); } break; } -- cgit v1.1 From 4e3129b0caceec008a940aa5eada253cd0f0b3ec Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Thu, 5 Aug 2021 10:21:30 +0200 Subject: Fix oversight in handling of reverse SSO in SRA pass The scalar storage order does not apply to pointer and vector components. gcc/ PR tree-optimization/101626 * tree-sra.c (propagate_subaccesses_from_rhs): Do not set the reverse scalar storage order on a pointer or vector component. gcc/testsuite/ * gcc.dg/sso-15.c: New test. 
--- gcc/testsuite/gcc.dg/sso-15.c | 36 ++++++++++++++++++++++++++++++++++++ gcc/tree-sra.c | 5 ++++- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/sso-15.c (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/sso-15.c b/gcc/testsuite/gcc.dg/sso-15.c new file mode 100644 index 0000000..d8a711d --- /dev/null +++ b/gcc/testsuite/gcc.dg/sso-15.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define REV_ENDIANNESS __attribute__((scalar_storage_order("big-endian"))) +#else +#define REV_ENDIANNESS __attribute__((scalar_storage_order("little-endian"))) +#endif + +struct X { int *p; } REV_ENDIANNESS; + +struct X x; + +struct X __attribute__((noinline)) foo (int *p) +{ + struct X x; + x.p = p; + return x; +} + +void __attribute((noinline)) bar (void) +{ + *x.p = 1; +} + +extern void abort (void); + +int main (void) +{ + int i = 0; + x = foo(&i); + bar(); + if (i != 1) + abort (); + return 0; +} diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c index c05d22f..3a9e14f 100644 --- a/gcc/tree-sra.c +++ b/gcc/tree-sra.c @@ -2790,7 +2790,10 @@ propagate_subaccesses_from_rhs (struct access *lacc, struct access *racc) { /* We are about to change the access type from aggregate to scalar, so we need to put the reverse flag onto the access, if any. 
*/ - const bool reverse = TYPE_REVERSE_STORAGE_ORDER (lacc->type); + const bool reverse + = TYPE_REVERSE_STORAGE_ORDER (lacc->type) + && !POINTER_TYPE_P (racc->type) + && !VECTOR_TYPE_P (racc->type); tree t = lacc->base; lacc->type = racc->type; -- cgit v1.1 From d0a5624bb40470cb0ee03fe86530118ca6de0b3a Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Sun, 25 Jul 2021 20:52:08 -0500 Subject: cfgloop: Make loops_list support an optional loop_p root This patch follows Richi's suggestion to add one optional argument class loop* root to loops_list's CTOR, it can provide the ability to construct a visiting list starting from the given class loop* ROOT rather than the default tree_root of loops_for_fn (FN), for visiting a subset of the loop tree. It unifies all orders of walkings into walk_loop_tree, but it still uses linear search for LI_ONLY_INNERMOST when looking at the whole loop tree since it has a more stable bound. gcc/ChangeLog: * cfgloop.h (loops_list::loops_list): Add one optional argument root and adjust accordingly, update loop tree walking and factor out to ... * cfgloop.c (loops_list::walk_loop_tree): ... this. New function. --- gcc/cfgloop.c | 66 ++++++++++++++++++++++++++++++++++++++ gcc/cfgloop.h | 100 +++++++++++++++++++++++----------------------------------- 2 files changed, 106 insertions(+), 60 deletions(-) (limited to 'gcc') diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c index 6284ae2..2ba9918 100644 --- a/gcc/cfgloop.c +++ b/gcc/cfgloop.c @@ -2104,3 +2104,69 @@ mark_loop_for_removal (loop_p loop) loop->latch = NULL; loops_state_set (LOOPS_NEED_FIXUP); } + +/* Starting from loop tree ROOT, walk loop tree as the visiting + order specified by FLAGS. 
The supported visiting orders + are: + - LI_ONLY_INNERMOST + - LI_FROM_INNERMOST + - Preorder (if neither of above is specified) */ + +void +loops_list::walk_loop_tree (class loop *root, unsigned flags) +{ + bool only_innermost_p = flags & LI_ONLY_INNERMOST; + bool from_innermost_p = flags & LI_FROM_INNERMOST; + bool preorder_p = !(only_innermost_p || from_innermost_p); + + /* Early handle root without any inner loops, make later + processing simpler, that is all loops processed in the + following while loop are impossible to be root. */ + if (!root->inner) + { + if (flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); + return; + } + else if (preorder_p && flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); + + class loop *aloop; + for (aloop = root->inner; + aloop->inner != NULL; + aloop = aloop->inner) + { + if (preorder_p) + this->to_visit.quick_push (aloop->num); + continue; + } + + while (1) + { + gcc_assert (aloop != root); + if (from_innermost_p || aloop->inner == NULL) + this->to_visit.quick_push (aloop->num); + + if (aloop->next) + { + for (aloop = aloop->next; + aloop->inner != NULL; + aloop = aloop->inner) + { + if (preorder_p) + this->to_visit.quick_push (aloop->num); + continue; + } + } + else if (loop_outer (aloop) == root) + break; + else + aloop = loop_outer (aloop); + } + + /* When visiting from innermost, we need to consider root here + since the previous while loop doesn't handle it. */ + if (from_innermost_p && flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (root->num); +} + diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 885632b..0f71a6bf 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -669,13 +669,15 @@ as_const (T &t) } /* A list for visiting loops, which contains the loop numbers instead of - the loop pointers. The scope is restricted in function FN and the - visiting order is specified by FLAGS. */ + the loop pointers. 
If the loop ROOT is offered (non-null), the visiting + will start from it, otherwise it would start from the tree_root of + loops_for_fn (FN) instead. The scope is restricted in function FN and + the visiting order is specified by FLAGS. */ class loops_list { public: - loops_list (function *fn, unsigned flags); + loops_list (function *fn, unsigned flags, class loop *root = nullptr); template class Iter { @@ -750,6 +752,10 @@ public: } private: + /* Walk loop tree starting from ROOT as the visiting order specified + by FLAGS. */ + void walk_loop_tree (class loop *root, unsigned flags); + /* The function we are visiting. */ function *fn; @@ -782,76 +788,50 @@ loops_list::Iter::fill_curr_loop () } /* Set up the loops list to visit according to the specified - function scope FN and iterating order FLAGS. */ + function scope FN and iterating order FLAGS. If ROOT is + not null, the visiting would start from it, otherwise it + will start from tree_root of loops_for_fn (FN). */ -inline loops_list::loops_list (function *fn, unsigned flags) +inline loops_list::loops_list (function *fn, unsigned flags, class loop *root) { - class loop *aloop; - unsigned i; - int mn; + struct loops *loops = loops_for_fn (fn); + gcc_assert (!root || loops); + + /* Check mutually exclusive flags should not co-exist. */ + unsigned checked_flags = LI_ONLY_INNERMOST | LI_FROM_INNERMOST; + gcc_assert ((flags & checked_flags) != checked_flags); this->fn = fn; - if (!loops_for_fn (fn)) + if (!loops) return; + class loop *tree_root = root ? root : loops->tree_root; + this->to_visit.reserve_exact (number_of_loops (fn)); - mn = (flags & LI_INCLUDE_ROOT) ? 0 : 1; - if (flags & LI_ONLY_INNERMOST) + /* When root is tree_root of loops_for_fn (fn) and the visiting + order is LI_ONLY_INNERMOST, we would like to use linear + search here since it has a more stable bound than the + walk_loop_tree. 
*/ + if (flags & LI_ONLY_INNERMOST && tree_root == loops->tree_root) { - for (i = 0; vec_safe_iterate (loops_for_fn (fn)->larray, i, &aloop); i++) - if (aloop != NULL - && aloop->inner == NULL - && aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - } - else if (flags & LI_FROM_INNERMOST) - { - /* Push the loops to LI->TO_VISIT in postorder. */ - for (aloop = loops_for_fn (fn)->tree_root; - aloop->inner != NULL; - aloop = aloop->inner) - continue; - - while (1) + gcc_assert (tree_root->num == 0); + if (tree_root->inner == NULL) { - if (aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - - if (aloop->next) - { - for (aloop = aloop->next; - aloop->inner != NULL; - aloop = aloop->inner) - continue; - } - else if (!loop_outer (aloop)) - break; - else - aloop = loop_outer (aloop); + if (flags & LI_INCLUDE_ROOT) + this->to_visit.quick_push (0); + + return; } + + class loop *aloop; + unsigned int i; + for (i = 1; vec_safe_iterate (loops->larray, i, &aloop); i++) + if (aloop != NULL && aloop->inner == NULL) + this->to_visit.quick_push (aloop->num); } else - { - /* Push the loops to LI->TO_VISIT in preorder. */ - aloop = loops_for_fn (fn)->tree_root; - while (1) - { - if (aloop->num >= mn) - this->to_visit.quick_push (aloop->num); - - if (aloop->inner != NULL) - aloop = aloop->inner; - else - { - while (aloop != NULL && aloop->next == NULL) - aloop = loop_outer (aloop); - if (aloop == NULL) - break; - aloop = aloop->next; - } - } - } + walk_loop_tree (tree_root, flags); } /* The properties of the target. */ -- cgit v1.1 From f0fc1e66238e2e9fe0cbc6e5b77fd163bf887b2c Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 5 Aug 2021 11:39:50 +0200 Subject: Adjust gcc.dg/vect/bb-slp-pr101756.c This adjusts the testcase for excess diagnostics emitted by some targets because of the attribute simd usage like warning: GCC does not currently support mixed size types for 'simd' functions on aarch64. 
2021-08-05 Richard Biener * gcc.dg/vect/bb-slp-pr101756.c: Add -w. --- gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c index 9420e77..de7f180 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101756.c @@ -1,4 +1,6 @@ /* { dg-do compile } */ +/* SIMD support can emit additional diagnostics. */ +/* { dg-additional-options "-w" } */ __attribute__ ((simd)) int tq (long int ea, int of, int kk) -- cgit v1.1 From 8cd27a3b25558e5be7f8595fc1c828bc46641671 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 28 Jul 2021 15:49:29 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD add cost The Neon add-long/add-widen instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon add cost function to match vec_select high-half of its operands. This traversal prevents the cost of the vec_select from being added into the cost of the subtract - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-28 Jonathan Wright * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon add cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vaddX_high_cost.c: New test. 
--- gcc/config/aarch64/aarch64.c | 15 +++++++++ gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c | 38 ++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index e02cbcb..aa687c5 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13161,6 +13161,21 @@ cost_minus: op1 = XEXP (x, 1); cost_plus: + if (VECTOR_MODE_P (mode)) + { + /* ADDL2 and ADDW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the add instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the add. + */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) { diff --git a/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c new file mode 100644 index 0000000..43f28d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include + +#define TEST_ADDL(rettype, intype, ts, rs) \ + rettype test_vaddl_ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = vaddl_ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = vaddl_ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_ADDL (int16x8_t, int8x16_t, s8, s16) +TEST_ADDL (uint16x8_t, uint8x16_t, u8, u16) +TEST_ADDL (int32x4_t, int16x8_t, s16, s32) +TEST_ADDL (uint32x4_t, uint16x8_t, u16, u32) +TEST_ADDL (int64x2_t, int32x4_t, s32, s64) +TEST_ADDL (uint64x2_t, uint32x4_t, u32, u64) + +#define TEST_ADDW(rettype, intype, intypel, ts, 
rs) \ + rettype test_vaddw_ ## ts (intype a, intype b, intypel c) \ + { \ + rettype t0 = vaddw_ ## ts (a, vget_high_ ## ts (c)); \ + rettype t1 = vaddw_ ## ts (b, vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_ADDW (int16x8_t, int16x8_t, int8x16_t, s8, s16) +TEST_ADDW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) +TEST_ADDW (int32x4_t, int32x4_t, int16x8_t, s16, s32) +TEST_ADDW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) +TEST_ADDW (int64x2_t, int64x2_t, int32x4_t, s32, s64) +TEST_ADDW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ -- cgit v1.1 From 0c3aab7f2a394b69a0cfd4852e33f11d9eb7e737 Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 28 Jul 2021 17:45:36 +0100 Subject: aarch64: Don't include vec_select high-half in SIMD subtract cost The Neon subtract-long/subract-widen instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon subtract cost function to match vec_select high-half of its operands. This traversal prevents the cost of the vec_select from being added into the cost of the subtract - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-28 Jonathan Wright * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon subtract cost. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vsubX_high_cost.c: New test. 
--- gcc/config/aarch64/aarch64.c | 15 +++++++++ gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c | 38 ++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index aa687c5..30f8365 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, op1 = XEXP (x, 1); cost_minus: + if (VECTOR_MODE_P (mode)) + { + /* SUBL2 and SUBW2. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The select-operand-high-half versions of the sub instruction + have the same cost as the regular three vector version - + don't add the costs of the select into the costs of the sub. + */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); + } + } + *cost += rtx_cost (op0, mode, MINUS, 0, speed); /* Detect valid immediates. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c new file mode 100644 index 0000000..09bc7fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include + +#define TEST_SUBL(rettype, intype, ts, rs) \ + rettype test_vsubl_ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_SUBL (int16x8_t, int8x16_t, s8, s16) +TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16) +TEST_SUBL (int32x4_t, int16x8_t, s16, s32) +TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32) +TEST_SUBL (int64x2_t, int32x4_t, s32, s64) +TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64) + +#define TEST_SUBW(rettype, intype, intypel, ts, rs) \ + rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \ + { \ + rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \ + rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \ + return vaddq ## _ ## rs (t0, t1); \ + } + +TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16) +TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) +TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32) +TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) +TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64) +TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) + +/* { dg-final { scan-assembler-not "dup\\t" } } */ -- cgit v1.1 From 62e66c6a6cc52dc0e014141d369cff52757cd7ae Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Mon, 26 Jul 2021 17:07:14 +0100 Subject: arm: ensure the arch_name is always set for the build target This should never happen now if GCC is invoked by the driver, but in the unusual case of calling cc1 (or its ilk) directly from the command line the build target's arch_name string can remain NULL. 
This can complicate later processing meaning that we need to check for this case explicitly in some circumstances. Nothing should rely on this behaviour, so it's simpler to always set the arch_name when configuring the build target and be done with it. gcc: * config/arm/arm.c (arm_configure_build_target): Ensure the target's arch_name is always set. --- gcc/config/arm/arm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc') diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 6d781e2..b2dd58d 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3432,6 +3432,8 @@ arm_configure_build_target (struct arm_build_target *target, const cpu_tune *tune_data = &all_tunes[arm_selected_tune - all_cores]; /* Finish initializing the target structure. */ + if (!target->arch_name) + target->arch_name = arm_selected_arch->common.name; target->arch_pp_name = arm_selected_arch->arch; target->base_arch = arm_selected_arch->base_arch; target->profile = arm_selected_arch->profile; -- cgit v1.1 From 6a37d0331c25f23628d4308e5a75624005c223b2 Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Tue, 27 Jul 2021 15:44:57 +0100 Subject: arm: Don't reconfigure globals in arm_configure_build_target arm_configure_build_target is usually used to reconfigure the arm_active_target structure, which is then used to reconfigure a number of other global variables describing the current target. Occasionally, however, we need to use arm_configure_build_target to construct a temporary target structure and in that case it is wrong to try to reconfigure the global variables (although probably harmless, since arm_option_reconfigure_globals() only looks at arm_active_target). At the very least, however, this is wasted work, so it is best not to do it unless needed. What's more, several callers of arm_configure_build target call arm_option_reconfigure_globals themselves within a few lines, making the call from within arm_configure_build_target completely redundant. 
So this patch moves the responsibility of calling of arm_configure_build_target to its callers (only two places needed updating). gcc: * config/arm/arm.c (arm_configure_build_target): Don't call arm_option_reconfigure_globals. (arm_option_restore): Call arm_option_reconfigure_globals after reconfiguring the target. * config/arm/arm-c.c (arm_pragma_target_parse): Likewise. --- gcc/config/arm/arm-c.c | 1 + gcc/config/arm/arm.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index ae2139c..cc7901b 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -409,6 +409,7 @@ arm_pragma_target_parse (tree args, tree pop_target) target_option_current_node = cur_tree; arm_configure_build_target (&arm_active_target, TREE_TARGET_OPTION (cur_tree), false); + arm_option_reconfigure_globals (); } /* Update macros if target_node changes. The global state will be restored diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index b2dd58d..273202a 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3058,6 +3058,7 @@ arm_option_restore (struct gcc_options */* opts */, struct cl_target_option *ptr) { arm_configure_build_target (&arm_active_target, ptr, false); + arm_option_reconfigure_globals (); } /* Reset options between modes that the user has specified. */ @@ -3441,7 +3442,6 @@ arm_configure_build_target (struct arm_build_target *target, target->tune_flags = tune_data->tune_flags; target->tune = tune_data->tune; target->tune_core = tune_data->scheduler; - arm_option_reconfigure_globals (); } /* Fix up any incompatible options that the user has specified. 
*/ -- cgit v1.1 From c1cdabe3aab817d95a8db00a8b5e9f6bcdea936f Mon Sep 17 00:00:00 2001 From: Richard Earnshaw Date: Thu, 29 Jul 2021 11:00:31 +0100 Subject: arm: reorder assembler architecture directives [PR101723] A change to the way gas interprets the .fpu directive in binutils-2.34 means that issuing .fpu will clear any features set by .arch_extension that apply to the floating point or simd units. This unfortunately causes problems for more recent versions of the architecture because we currently emit .arch, .arch_extension and .fpu directives at different times and try to suppress redundant changes. This change addresses this by firstly unifying all the places where we emit these directives to a single block of code and secondly (re)emitting all the directives if any changes have been made to the target options. Whilst this is slightly more than the strict minimum it should be enough to catch all cases where a change could have happened. The new code also emits the directives in the order: .arch, .fpu, .arch_extension. This ensures that the additional architectural extensions are not removed by a later .fpu directive. Whilst writing this patch I also noticed that in the corner case where the last function to be compiled had a non-standard set of architecture flags, the assembler would add an incorrect set of derived attributes for the file as a whole. Instead of reflecting the command-line options it would reflect the flags from the last file in the function. To address this I've also added a call to re-emit the flags from the asm_file_end callback so the assembler will be in the correct state when it finishes processing the intput. There's some slight churn to the testsuite as a consequence of this, because previously we had a hack to suppress emitting a .fpu directive for one specific case, but with the new order this is no-longer necessary. 
gcc/ChangeLog: PR target/101723 * config/arm/arm-cpus.in (generic-armv7-a): Add quirk to suppress writing .cpu directive in asm output. * config/arm/arm.c (arm_identify_fpu_from_isa): New variable. (arm_last_printed_arch_string): Delete. (arm_last-printed_fpu_string): Delete. (arm_configure_build_target): If use of floating-point/SIMD is disabled, remove all fp/simd related features from the target ISA. (last_arm_targ_options): New variable. (arm_print_asm_arch_directives): Add new parameters. Change order of emitted directives and handle all cases here. (arm_file_start): Always call arm_print_asm_arch_directives, move all generation of .arch/.arch_extension here. (arm_file_end): Call arm_print_asm_arch. (arm_declare_function_name): Call arm_print_asm_arch_directives instead of printing .arch/.fpu directives directly. gcc/testsuite/ChangeLog: PR target/101723 * gcc.target/arm/cortex-m55-nofp-flag-hard.c: Update expected output. * gcc.target/arm/cortex-m55-nofp-flag-softfp.c: Likewise. * gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c: Likewise. * gcc.target/arm/mve/intrinsics/mve_fpu1.c: Convert to dg-do assemble. Add a non-no-op function body. * gcc.target/arm/mve/intrinsics/mve_fpu2.c: Likewise. * gcc.target/arm/pr98636.c (dg-options): Add -mfloat-abi=softfp. * gcc.target/arm/attr-neon.c: Tighten scan-assembler tests. * gcc.target/arm/attr-neon2.c: Use -Ofast, convert test to use check-function-bodies. * gcc.target/arm/attr-neon3.c: Likewise. * gcc.target/arm/pr69245.c: Tighten scan-assembler match, but allow multiple instances. * gcc.target/arm/pragma_fpu_attribute.c: Likewise. * gcc.target/arm/pragma_fpu_attribute_2.c: Likewise. 
--- gcc/config/arm/arm-cpus.in | 1 + gcc/config/arm/arm.c | 186 +++++++++------------ gcc/testsuite/gcc.target/arm/attr-neon.c | 9 +- gcc/testsuite/gcc.target/arm/attr-neon2.c | 35 ++-- gcc/testsuite/gcc.target/arm/attr-neon3.c | 48 ++++-- .../gcc.target/arm/cortex-m55-nofp-flag-hard.c | 2 +- .../gcc.target/arm/cortex-m55-nofp-flag-softfp.c | 2 +- .../arm/cortex-m55-nofp-nomve-flag-softfp.c | 2 +- .../gcc.target/arm/mve/intrinsics/mve_fpu1.c | 5 +- .../gcc.target/arm/mve/intrinsics/mve_fpu2.c | 5 +- gcc/testsuite/gcc.target/arm/pr69245.c | 6 +- gcc/testsuite/gcc.target/arm/pr98636.c | 3 +- .../gcc.target/arm/pragma_fpu_attribute.c | 7 +- .../gcc.target/arm/pragma_fpu_attribute_2.c | 7 +- 14 files changed, 169 insertions(+), 149 deletions(-) (limited to 'gcc') diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index ab4b6ac..249995a 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -1080,6 +1080,7 @@ begin cpu generic-armv7-a cname genericv7a tune flags LDSCHED architecture armv7-a+fp + isa quirk_no_asmcpu option mp add mp option sec add sec option vfpv3-d16 add VFPv3 FP_DBL diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 273202a..11dafc7 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -79,10 +79,6 @@ typedef struct minipool_node Mnode; typedef struct minipool_fixup Mfix; -/* The last .arch and .fpu assembly strings that we printed. */ -static std::string arm_last_printed_arch_string; -static std::string arm_last_printed_fpu_string; - void (*arm_lang_output_object_attributes_hook)(void); struct four_ints @@ -334,6 +330,7 @@ static rtx_insn *thumb1_md_asm_adjust (vec &, vec &, vec &, vec &, vec &, HARD_REG_SET &, location_t); +static const char *arm_identify_fpu_from_isa (sbitmap); /* Table of machine attributes. 
*/ static const struct attribute_spec arm_attribute_table[] = @@ -3411,6 +3408,11 @@ arm_configure_build_target (struct arm_build_target *target, bitmap_ior (target->isa, target->isa, fpu_bits); } + /* If we have the soft-float ABI, clear any feature bits relating to use of + floating-point operations. They'll just confuse things later on. */ + if (arm_float_abi == ARM_FLOAT_ABI_SOFT) + bitmap_and_compl (target->isa, target->isa, isa_all_fpbits); + /* There may be implied bits which we still need to enable. These are non-named features which are needed to complete other sets of features, but cannot be enabled from arm-cpus.in due to being shared between @@ -28096,20 +28098,65 @@ arm_print_tune_info (void) (int) current_tune->sched_autopref); } +/* The last set of target options used to emit .arch directives, etc. This + could be a function-local static if it were not required to expose it as a + root to the garbage collector. */ +static GTY(()) cl_target_option *last_asm_targ_options = NULL; + /* Print .arch and .arch_extension directives corresponding to the current architecture configuration. */ static void -arm_print_asm_arch_directives () +arm_print_asm_arch_directives (FILE *stream, cl_target_option *targ_options) { + arm_build_target build_target; + /* If the target options haven't changed since the last time we were called + there is nothing to do. This should be sufficient to suppress the + majority of redundant work. 
*/ + if (last_asm_targ_options == targ_options) + return; + + last_asm_targ_options = targ_options; + + build_target.isa = sbitmap_alloc (isa_num_bits); + arm_configure_build_target (&build_target, targ_options, false); + + if (build_target.core_name + && !bitmap_bit_p (build_target.isa, isa_bit_quirk_no_asmcpu)) + { + const char* truncated_name + = arm_rewrite_selected_cpu (build_target.core_name); + asm_fprintf (stream, "\t.cpu %s\n", truncated_name); + } + const arch_option *arch = arm_parse_arch_option_name (all_architectures, "-march", - arm_active_target.arch_name); + build_target.arch_name); auto_sbitmap opt_bits (isa_num_bits); gcc_assert (arch); - asm_fprintf (asm_out_file, "\t.arch %s\n", arm_active_target.arch_name); - arm_last_printed_arch_string = arm_active_target.arch_name; + if (strcmp (build_target.arch_name, "armv7ve") == 0) + { + /* Keep backward compatability for assemblers which don't support + armv7ve. Fortunately, none of the following extensions are reset + by a .fpu directive. */ + asm_fprintf (stream, "\t.arch armv7-a\n"); + asm_fprintf (stream, "\t.arch_extension virt\n"); + asm_fprintf (stream, "\t.arch_extension idiv\n"); + asm_fprintf (stream, "\t.arch_extension sec\n"); + asm_fprintf (stream, "\t.arch_extension mp\n"); + } + else + asm_fprintf (stream, "\t.arch %s\n", build_target.arch_name); + + /* The .fpu directive will reset any architecture extensions from the + assembler that relate to the fp/vector extensions. So put this out before + any .arch_extension directives. */ + const char *fpu_name = (TARGET_SOFT_FLOAT + ? "softvfp" + : arm_identify_fpu_from_isa (build_target.isa)); + asm_fprintf (stream, "\t.fpu %s\n", fpu_name); + if (!arch->common.extensions) return; @@ -28135,13 +28182,12 @@ arm_print_asm_arch_directives () && !TARGET_HAVE_MVE_FLOAT)) continue; - /* If every feature bit of this option is set in the target - ISA specification, print out the option name. 
However, - don't print anything if all the bits are part of the - FPU specification. */ - if (bitmap_subset_p (opt_bits, arm_active_target.isa) + /* If every feature bit of this option is set in the target ISA + specification, print out the option name. However, don't print + anything if all the bits are part of the FPU specification. */ + if (bitmap_subset_p (opt_bits, build_target.isa) && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", opt->name); + asm_fprintf (stream, "\t.arch_extension %s\n", opt->name); } } } @@ -28151,46 +28197,23 @@ arm_file_start (void) { int val; + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (TARGET_BPABI) { - /* We don't have a specified CPU. Use the architecture to - generate the tags. - - Note: it might be better to do this unconditionally, then the - assembler would not need to know about all new CPU names as - they are added. */ - if (!arm_active_target.core_name) - { - /* armv7ve doesn't support any extensions. */ - if (strcmp (arm_active_target.arch_name, "armv7ve") == 0) - { - /* Keep backward compatability for assemblers - which don't support armv7ve. 
*/ - asm_fprintf (asm_out_file, "\t.arch armv7-a\n"); - asm_fprintf (asm_out_file, "\t.arch_extension virt\n"); - asm_fprintf (asm_out_file, "\t.arch_extension idiv\n"); - asm_fprintf (asm_out_file, "\t.arch_extension sec\n"); - asm_fprintf (asm_out_file, "\t.arch_extension mp\n"); - arm_last_printed_arch_string = "armv7ve"; - } - else - arm_print_asm_arch_directives (); - } - else if (startswith (arm_active_target.core_name, "generic")) - { - asm_fprintf (asm_out_file, "\t.arch %s\n", - arm_active_target.core_name + 8); - arm_last_printed_arch_string = arm_active_target.core_name + 8; - } - else + /* If we have a named cpu, but we the assembler does not support that + name via .cpu, put out a cpu name attribute; but don't do this if the + name starts with the fictitious prefix, 'generic'. */ + if (arm_active_target.core_name + && bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu) + && !startswith (arm_active_target.core_name, "generic")) { const char* truncated_name = arm_rewrite_selected_cpu (arm_active_target.core_name); if (bitmap_bit_p (arm_active_target.isa, isa_bit_quirk_no_asmcpu)) asm_fprintf (asm_out_file, "\t.eabi_attribute 5, \"%s\"\n", truncated_name); - else - asm_fprintf (asm_out_file, "\t.cpu %s\n", truncated_name); } if (print_tune_info) @@ -28255,6 +28278,13 @@ arm_file_end (void) { int regno; + /* Just in case the last function output in the assembler had non-default + architecture directives, we force the assembler state back to the default + set, so that any 'calculated' build attributes are based on the default + options rather than the special options for that function. */ + arm_print_asm_arch_directives + (asm_out_file, TREE_TARGET_OPTION (target_option_default_node)); + if (NEED_INDICATE_EXEC_STACK) /* Add .note.GNU-stack. 
*/ file_end_indicate_exec_stack (); @@ -33265,58 +33295,7 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) targ_options = TREE_TARGET_OPTION (target_option_current_node); gcc_assert (targ_options); - /* Only update the assembler .arch string if it is distinct from the last - such string we printed. arch_to_print is set conditionally in case - targ_options->x_arm_arch_string is NULL which can be the case - when cc1 is invoked directly without passing -march option. */ - std::string arch_to_print; - if (targ_options->x_arm_arch_string) - arch_to_print = targ_options->x_arm_arch_string; - - if (arch_to_print != arm_last_printed_arch_string) - { - std::string arch_name - = arch_to_print.substr (0, arch_to_print.find ("+")); - asm_fprintf (asm_out_file, "\t.arch %s\n", arch_name.c_str ()); - const arch_option *arch - = arm_parse_arch_option_name (all_architectures, "-march", - targ_options->x_arm_arch_string); - auto_sbitmap opt_bits (isa_num_bits); - - gcc_assert (arch); - if (arch->common.extensions) - { - for (const struct cpu_arch_extension *opt = arch->common.extensions; - opt->name != NULL; - opt++) - { - if (!opt->remove) - { - arm_initialize_isa (opt_bits, opt->isa_bits); - /* For the cases "-march=armv8.1-m.main+mve -mfloat-abi=soft" - and "-march=armv8.1-m.main+mve.fp -mfloat-abi=soft" MVE and - MVE with floating point instructions is disabled. So the - following check restricts the printing of ".arch_extension - mve" and ".arch_extension fp" (for mve.fp) in the assembly - file. MVE needs this special behaviour because the - feature bit "mve" and "mve_float" are not part of - "fpu bits", so they are not cleared when -mfloat-abi=soft - (i.e nofp) but the marco TARGET_HAVE_MVE and - TARGET_HAVE_MVE_FLOAT are disabled. 
*/ - if ((bitmap_bit_p (opt_bits, isa_bit_mve) && !TARGET_HAVE_MVE) - || (bitmap_bit_p (opt_bits, isa_bit_mve_float) - && !TARGET_HAVE_MVE_FLOAT)) - continue; - if (bitmap_subset_p (opt_bits, arm_active_target.isa) - && !bitmap_subset_p (opt_bits, isa_all_fpubits_internal)) - asm_fprintf (asm_out_file, "\t.arch_extension %s\n", - opt->name); - } - } - } - - arm_last_printed_arch_string = arch_to_print; - } + arm_print_asm_arch_directives (stream, targ_options); fprintf (stream, "\t.syntax unified\n"); @@ -33334,17 +33313,6 @@ arm_declare_function_name (FILE *stream, const char *name, tree decl) else fprintf (stream, "\t.arm\n"); - std::string fpu_to_print - = TARGET_SOFT_FLOAT - ? "softvfp" : arm_identify_fpu_from_isa (arm_active_target.isa); - - if (!(!strcmp (fpu_to_print.c_str (), "softvfp") && TARGET_VFP_BASE) - && (fpu_to_print != arm_last_printed_arch_string)) - { - asm_fprintf (asm_out_file, "\t.fpu %s\n", fpu_to_print.c_str ()); - arm_last_printed_fpu_string = fpu_to_print; - } - if (TARGET_POKE_FUNCTION_NAME) arm_poke_function_name (stream, (const char *) name); } diff --git a/gcc/testsuite/gcc.target/arm/attr-neon.c b/gcc/testsuite/gcc.target/arm/attr-neon.c index 225fb8d..e8e3086 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon.c @@ -1,7 +1,10 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-options "-O2 -ftree-vectorize" } */ -/* { dg-add-options arm_neon arm_v8_vfp } */ /* The arm_v8_vfp adds -mfpu=fp-armv8 to the command line, overriding any -mfpu= option set by arm_neon, thus ensuring that the attributes below really are checked for correct fpu selection. */ +/* { dg-add-options arm_neon arm_v8_vfp } */ +/* The arm_v8_vfp adds -mfpu=fp-armv8 to the command line, overriding any + -mfpu= option set by arm_neon, thus ensuring that the attributes below + really are checked for correct fpu selection. */ /* Verify that neon instructions are emitted once. 
*/ void __attribute__ ((target("fpu=neon"))) @@ -18,6 +21,6 @@ f3(int n, int x[], int y[]) { y[i] = x[i] << 3; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ +/* { dg-final { scan-assembler-times "\.fpu\\s+vfp\n" 1 } } */ +/* { dg-final { scan-assembler-times "\.fpu\\s+neon\n" 1 } } */ /* { dg-final { scan-assembler-times "vshl" 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/attr-neon2.c b/gcc/testsuite/gcc.target/arm/attr-neon2.c index 2966825..a7a72da 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon2.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon2.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_neon_ok } */ /* { dg-require-effective-target arm_fp_ok } */ -/* { dg-options "-O2" } */ +/* { dg-options "-Ofast" } */ /* { dg-add-options arm_fp } */ /* Reset fpu to a value compatible with the next pragmas. */ @@ -12,23 +12,36 @@ #include /* Check that pragma target is used. */ -int8x8_t -my (int8x8_t __a, int8x8_t __b) +/* +**my: +** ... +** vadd.f32 d[0-9]+, d[0-9]+, d[0-9]+ +** ... +** bx lr +*/ +float32x2_t +my (float32x2_t __a, float32x2_t __b) { return __a + __b; } #pragma GCC pop_options -/* Check that command line option is restored. */ -int8x8_t -my1 (int8x8_t __a, int8x8_t __b) +/* Check that fpu=vfp is restored. */ +/* +**my1: +** ... +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** ... 
+** bx lr +*/ +float32x2_t +my1 (float32x2_t __a, float32x2_t __b) { return __a + __b; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ -/* { dg-final { scan-assembler "vadd" } } */ - - +/* { dg-final { scan-assembler "\.fpu\\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+neon\n" } } */ +/* { dg-final { check-function-bodies "**" "" } } */ diff --git a/gcc/testsuite/gcc.target/arm/attr-neon3.c b/gcc/testsuite/gcc.target/arm/attr-neon3.c index 17e429a..0fbce6e 100644 --- a/gcc/testsuite/gcc.target/arm/attr-neon3.c +++ b/gcc/testsuite/gcc.target/arm/attr-neon3.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_fp_ok } */ -/* { dg-options "-O2" } */ +/* { dg-options "-Ofast" } */ /* { dg-add-options arm_fp } */ /* Reset fpu to a value compatible with the next pragmas. */ @@ -11,28 +11,54 @@ #include /* Check that neon is used. */ -int8x8_t __attribute__ ((target("fpu=neon"))) -my (int8x8_t __a, int8x8_t __b) +/* +**my: +** ... +** vadd.f32 d[0-9]+, d[0-9]+, d[0-9]+ +** ... +** bx lr +*/ +float32x2_t __attribute__ ((target("fpu=neon"))) +my (float32x2_t __a, float32x2_t __b) { return __a + __b; } /* Check that crypto builtins are recognized. */ +/* +**foo: +** ... +** ( +** vld1.64 {d[0-9]+-d[0-9]+}, \[r[0-9]+:64\] +** | +** vld1.64 {d[0-9]+}, \[r[0-9]+:64\]! +** vld1.64 {d[0-9]+}, \[r[0-9]+:64\] +** } +** ... +** bx lr +*/ + poly128_t __attribute__ ((target("fpu=crypto-neon-fp-armv8"))) foo (poly128_t* ptr) { return vldrq_p128 (ptr); } -/* Check that default mode is restored. */ -int8x8_t -my1 (int8x8_t __a, int8x8_t __b) +/* Check that fpu=vfp is restored. */ +/* +**my1: +** ... +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** vadd.f32 s[0-9]+, s[0-9]+, s[0-9]+ +** ... 
+** bx lr +*/float32x2_t +my1 (float32x2_t __a, float32x2_t __b) { return __a + __b; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu neon" 1 } } */ -/* { dg-final { scan-assembler-times "\.fpu crypto-neon-fp-armv8" 1 } } */ -/* { dg-final { scan-assembler-times "vld1" 1 } } */ -/* { dg-final { scan-assembler-times "vadd" 1} } */ +/* { dg-final { scan-assembler "\.fpu\\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+neon\n" } } */ +/* { dg-final { scan-assembler "\.fpu\\s+crypto-neon-fp-armv8\n" } } */ +/* { dg-final { check-function-bodies "**" "" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c index 6a92ded..e0fb307 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-hard.c @@ -1,12 +1,12 @@ /* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nofp -mthumb -mfloat-abi=hard -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler "\.arch_extension mve" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c index 25e80e9..50645e8 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-flag-softfp.c @@ -1,12 +1,12 @@ /* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nofp 
-mthumb -mfloat-abi=softfp -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler "\.arch_extension mve" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c index 38042cc..948f622 100644 --- a/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c +++ b/gcc/testsuite/gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c @@ -1,12 +1,12 @@ /* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ /* { dg-additional-options "-mcpu=cortex-m55+nomve+nofp -mthumb -mfloat-abi=softfp -mfpu=auto --save-temps" } */ +/* { dg-final { scan-assembler "\.fpu softvfp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve" } } */ /* { dg-final { scan-assembler-not "\.arch_extension mve.fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp" } } */ /* { dg-final { scan-assembler-not "\.arch_extension fp.dp" } } */ /* { dg-final { scan-assembler "\.arch_extension dsp" } } */ -/* { dg-final { scan-assembler-not "\.fpu" } } */ int f () diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c index 611097e..c5acdb5 100644 --- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c @@ -1,3 +1,4 @@ +/* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_ok } */ /* { dg-require-effective-target arm_hard_ok } */ /* { dg-add-options arm_v8_1m_mve } */ @@ -8,8 +9,6 @@ int8x16_t foo1 (int8x16_t value) { - int8x16_t b = 
value; + int8x16_t b = -value; return b; } - -/* { dg-final { scan-assembler-not "\.fpu softvfp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c index b8e1051..907db5e 100644 --- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c +++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c @@ -1,3 +1,4 @@ +/* { dg-do assemble } */ /* { dg-require-effective-target arm_v8_1m_mve_ok } */ /* { dg-require-effective-target arm_softfp_ok } */ /* { dg-add-options arm_v8_1m_mve } */ @@ -8,8 +9,6 @@ int8x16_t foo1 (int8x16_t value) { - int8x16_t b = value; + int8x16_t b = -value; return b; } - -/* { dg-final { scan-assembler-not "\.fpu softvfp" } } */ diff --git a/gcc/testsuite/gcc.target/arm/pr69245.c b/gcc/testsuite/gcc.target/arm/pr69245.c index bd50518..34b97a2 100644 --- a/gcc/testsuite/gcc.target/arm/pr69245.c +++ b/gcc/testsuite/gcc.target/arm/pr69245.c @@ -23,4 +23,8 @@ void fn2 () d = b * c + a; } -/* { dg-final { scan-assembler-times "\.fpu vfp" 1 } } */ +/* Because we don't know the exact command-line options used to invoke the test + we cannot expect these tests to match exactly once. But they must appear at + least once. 
*/ +/* { dg-final { scan-assembler "\.fpu\s+vfp\n" } } */ +/* { dg-final { scan-assembler "\.fpu\s+neon-vfpv4\n" } } */ diff --git a/gcc/testsuite/gcc.target/arm/pr98636.c b/gcc/testsuite/gcc.target/arm/pr98636.c index c4d235c..559f9a2 100644 --- a/gcc/testsuite/gcc.target/arm/pr98636.c +++ b/gcc/testsuite/gcc.target/arm/pr98636.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ -/* { dg-options "-mfp16-format=alternative" } */ +/* { dg-require-effective-target arm_softfp_ok } */ +/* { dg-options "-mfp16-format=alternative -mfloat-abi=softfp" } */ #pragma GCC push_options # pragma GCC target ("arch=armv8.2-a+fp16") /* { dg-error "selected fp16 options are incompatible" } */ diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c index 174be85..7e63cf5 100644 --- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c +++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute.c @@ -22,5 +22,8 @@ uint32_t restored () return bar(); } -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv4} 1 } } */ -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv3-d16} 1 } } */ +/* We can't tell exactly how many times the following tests will match because + command-line options may cause additional instances to be generated, but + each must be present at least once. 
*/ +/* { dg-final { scan-assembler {\.fpu\s+vfpv4\n} } } */ +/* { dg-final { scan-assembler {\.fpu\s+vfpv3-d16\n} } } */ diff --git a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c index add40dd..3d33b04 100644 --- a/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c +++ b/gcc/testsuite/gcc.target/arm/pragma_fpu_attribute_2.c @@ -25,5 +25,8 @@ uint32_t restored () return bar(); } -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv4} 1 } } */ -/* { dg-final { scan-assembler-times {\.fpu\s+vfpv3-d16} 1 } } */ +/* We can't tell exactly how many times the following tests will match because + command-line options may cause additional instances to be generated, but + each must be present at least once. */ +/* { dg-final { scan-assembler-times {\.fpu\s+vfpv4\n} } } */ +/* { dg-final { scan-assembler-times {\.fpu\s+vfpv3-d16\n} } } */ -- cgit v1.1 From 783d809f0bb13a9f50139d03c328f59f9e3840c7 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 5 Aug 2021 14:03:23 +0100 Subject: vect: Move costing helpers from aarch64 code aarch64.c has various routines to test for specific kinds of vector statement cost. The routines aren't really target-specific, so following a suggestion from Richi, this patch moves them to a new section of tree-vectorizer.h. gcc/ * tree-vectorizer.h (vect_is_store_elt_extraction, vect_is_reduction) (vect_reduc_type, vect_embedded_comparison_type, vect_comparison_type) (vect_is_extending_load, vect_is_integer_truncation): New functions, moved from aarch64.c but given different names. * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction) (aarch64_is_reduction, aarch64_reduc_type) (aarch64_embedded_comparison_type, aarch64_comparison_type) (aarch64_extending_load_p, aarch64_integer_truncation_p): Delete in favor of the above. Update callers accordingly. 
--- gcc/config/aarch64/aarch64.c | 125 +++++-------------------------------------- gcc/tree-vectorizer.h | 104 +++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 111 deletions(-) (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 30f8365..4cd4b03 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -14820,40 +14820,6 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } -/* Return true if an operaton of kind KIND for STMT_INFO represents - the extraction of an element from a vector in preparation for - storing the element to memory. */ -static bool -aarch64_is_store_elt_extraction (vect_cost_for_stmt kind, - stmt_vec_info stmt_info) -{ - return (kind == vec_to_scalar - && STMT_VINFO_DATA_REF (stmt_info) - && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); -} - -/* Return true if STMT_INFO represents part of a reduction. */ -static bool -aarch64_is_reduction (stmt_vec_info stmt_info) -{ - return (STMT_VINFO_REDUC_DEF (stmt_info) - || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); -} - -/* If STMT_INFO describes a reduction, return the type of reduction - it describes, otherwise return -1. */ -static int -aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) -{ - if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) - if (STMT_VINFO_REDUC_DEF (stmt_info)) - { - stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); - return int (STMT_VINFO_REDUC_TYPE (reduc_info)); - } - return -1; -} - /* Return true if an access of kind KIND for STMT_INFO represents one vector of an LD[234] or ST[234] operation. Return the total number of vectors (2, 3 or 4) if so, otherwise return a value outside that range. 
*/ @@ -14874,32 +14840,6 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) return 0; } -/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_embedded_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast (stmt_info->stmt)) - if (gimple_assign_rhs_code (assign) == COND_EXPR) - { - tree cond = gimple_assign_rhs1 (assign); - if (COMPARISON_CLASS_P (cond)) - return TREE_TYPE (TREE_OPERAND (cond, 0)); - } - return NULL_TREE; -} - -/* If STMT_INFO is a comparison or contains an embedded comparison, return the - scalar type of the values being compared. Return null otherwise. */ -static tree -aarch64_comparison_type (stmt_vec_info stmt_info) -{ - if (auto *assign = dyn_cast (stmt_info->stmt)) - if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) - return TREE_TYPE (gimple_assign_rhs1 (assign)); - return aarch64_embedded_comparison_type (stmt_info); -} - /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD vectors would produce a series of LDP or STP operations. KIND is the kind of statement that STMT_INFO represents. */ @@ -14926,43 +14866,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind, return is_gimple_assign (stmt_info->stmt); } -/* Return true if STMT_INFO extends the result of a load. 
*/ -static bool -aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree rhs = gimple_assign_rhs1 (stmt_info->stmt); - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (rhs); - if (!INTEGRAL_TYPE_P (lhs_type) - || !INTEGRAL_TYPE_P (rhs_type) - || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) - return false; - - stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); - return (def_stmt_info - && STMT_VINFO_DATA_REF (def_stmt_info) - && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); -} - -/* Return true if STMT_INFO is an integer truncation. */ -static bool -aarch64_integer_truncation_p (stmt_vec_info stmt_info) -{ - gassign *assign = dyn_cast (stmt_info->stmt); - if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) - return false; - - tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); - tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); - return (INTEGRAL_TYPE_P (lhs_type) - && INTEGRAL_TYPE_P (rhs_type) - && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); -} - /* Return true if STMT_INFO is the second part of a two-statement multiply-add or multiply-subtract sequence that might be suitable for fusing into a single instruction. If VEC_FLAGS is zero, analyze the operation as @@ -15065,7 +14968,7 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, tree vectype, const sve_vec_cost *sve_costs) { - switch (aarch64_reduc_type (vinfo, stmt_info)) + switch (vect_reduc_type (vinfo, stmt_info)) { case EXTRACT_LAST_REDUCTION: return sve_costs->clast_cost; @@ -15156,7 +15059,7 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, { /* Detect an extension of a loaded value. In general, we'll be able to fuse the extension with the load. 
*/ - if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info)) return 0; return stmt_cost; @@ -15188,7 +15091,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which vec_to_scalar is describing the extraction of a vector element in preparation for a scalar store. The store itself is costed separately. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) return simd_costs->store_elt_extra_cost; /* Detect SVE gather loads, which are costed as a single scalar_load @@ -15227,7 +15130,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, instruction like FADDP or MAXV. */ if (kind == vec_to_scalar && where == vect_epilogue - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) switch (GET_MODE_INNER (TYPE_MODE (vectype))) { case E_QImode: @@ -15277,12 +15180,12 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, on the fly. Optimistically assume that a load followed by an extension will fold to this form during combine, and that the extension therefore comes for free. */ - if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info)) + if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info)) stmt_cost = 0; /* For similar reasons, vector_stmt integer truncations are a no-op, because we can just ignore the unused upper bits of the source. 
*/ - if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info)) + if (kind == vector_stmt && vect_is_integer_truncation (stmt_info)) stmt_cost = 0; /* Advanced SIMD can load and store pairs of registers using LDP and STP, @@ -15357,7 +15260,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == vector_stmt || kind == vec_to_scalar) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += simd_costs->fp_stmt_cost; @@ -15367,7 +15270,7 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info, } if (kind == scalar_stmt) - if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info)) + if (tree cmp_type = vect_embedded_comparison_type (stmt_info)) { if (FLOAT_TYPE_P (cmp_type)) stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost; @@ -15417,12 +15320,12 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Calculate the minimum cycles per iteration imposed by a reduction operation. */ if ((kind == vector_stmt || kind == vec_to_scalar) - && aarch64_is_reduction (stmt_info)) + && vect_is_reduction (stmt_info)) { unsigned int base = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype, vec_flags); - if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) + if (vect_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION) { if (aarch64_sve_mode_p (TYPE_MODE (vectype))) { @@ -15521,7 +15424,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Add any embedded comparison operations. 
*/ if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar) - && aarch64_embedded_comparison_type (stmt_info)) + && vect_embedded_comparison_type (stmt_info)) ops->general_ops += num_copies; /* Detect COND_REDUCTIONs and things that would need to become @@ -15530,7 +15433,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, have only accounted for one. */ if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar)) { - int reduc_type = aarch64_reduc_type (vinfo, stmt_info); + int reduc_type = vect_reduc_type (vinfo, stmt_info); if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD)) || reduc_type == COND_REDUCTION) ops->general_ops += num_copies; @@ -15538,7 +15441,7 @@ aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs, /* Count the predicate operations needed by an SVE comparison. */ if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar)) - if (tree type = aarch64_comparison_type (stmt_info)) + if (tree type = vect_comparison_type (stmt_info)) { unsigned int base = (FLOAT_TYPE_P (type) ? sve_issue->fp_cmp_pred_ops @@ -15616,7 +15519,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count, /* If we scalarize a strided store, the vectorizer costs one vec_to_scalar for each element. However, we can store the first element using an FP store without a separate extract step. */ - if (aarch64_is_store_elt_extraction (kind, stmt_info)) + if (vect_is_store_elt_extraction (kind, stmt_info)) count -= 1; stmt_cost = aarch64_detect_scalar_stmt_subtype diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index deb2247..686644b4 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2192,4 +2192,108 @@ extern vect_pattern_decl_t slp_patterns[]; /* Number of supported pattern matchers. 
*/ extern size_t num__slp_patterns; +/* ---------------------------------------------------------------------- + Target support routines + ----------------------------------------------------------------------- + The following routines are provided to simplify costing decisions in + target code. Please add more as needed. */ + +/* Return true if an operaton of kind KIND for STMT_INFO represents + the extraction of an element from a vector in preparation for + storing the element to memory. */ +inline bool +vect_is_store_elt_extraction (vect_cost_for_stmt kind, stmt_vec_info stmt_info) +{ + return (kind == vec_to_scalar + && STMT_VINFO_DATA_REF (stmt_info) + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); +} + +/* Return true if STMT_INFO represents part of a reduction. */ +inline bool +vect_is_reduction (stmt_vec_info stmt_info) +{ + return (STMT_VINFO_REDUC_DEF (stmt_info) + || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); +} + +/* If STMT_INFO describes a reduction, return the vect_reduction_type + of the reduction it describes, otherwise return -1. */ +inline int +vect_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) +{ + if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) + if (STMT_VINFO_REDUC_DEF (stmt_info)) + { + stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); + return int (STMT_VINFO_REDUC_TYPE (reduc_info)); + } + return -1; +} + +/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. 
*/ +inline tree +vect_embedded_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (gimple_assign_rhs_code (assign) == COND_EXPR) + { + tree cond = gimple_assign_rhs1 (assign); + if (COMPARISON_CLASS_P (cond)) + return TREE_TYPE (TREE_OPERAND (cond, 0)); + } + return NULL_TREE; +} + +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +inline tree +vect_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return vect_embedded_comparison_type (stmt_info); +} + +/* Return true if STMT_INFO extends the result of a load. */ +inline bool +vect_is_extending_load (class vec_info *vinfo, stmt_vec_info stmt_info) +{ + /* Although this is quite large for an inline function, this part + at least should be inline. */ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) + return false; + + tree rhs = gimple_assign_rhs1 (stmt_info->stmt); + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); + tree rhs_type = TREE_TYPE (rhs); + if (!INTEGRAL_TYPE_P (lhs_type) + || !INTEGRAL_TYPE_P (rhs_type) + || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) + return false; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + return (def_stmt_info + && STMT_VINFO_DATA_REF (def_stmt_info) + && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); +} + +/* Return true if STMT_INFO is an integer truncation. 
*/ +inline bool +vect_is_integer_truncation (stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) + return false; + + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); + tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); + return (INTEGRAL_TYPE_P (lhs_type) + && INTEGRAL_TYPE_P (rhs_type) + && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); +} + #endif /* GCC_TREE_VECTORIZER_H */ -- cgit v1.1 From c04bb6d93f3bd009800cb99e56c779a69d832691 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 5 Aug 2021 14:03:24 +0100 Subject: doc: Document cond_* shift optabs in md.texi gcc/ PR middle-end/101787 * doc/md.texi (cond_ashl, cond_ashr, cond_lshr): Document. --- gcc/doc/md.texi | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'gcc') diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index f6d1bc1..f8047ae 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -6921,6 +6921,9 @@ operand 0, otherwise (operand 2 + operand 3) is moved. @cindex @code{cond_smax@var{mode}} instruction pattern @cindex @code{cond_umin@var{mode}} instruction pattern @cindex @code{cond_umax@var{mode}} instruction pattern +@cindex @code{cond_ashl@var{mode}} instruction pattern +@cindex @code{cond_ashr@var{mode}} instruction pattern +@cindex @code{cond_lshr@var{mode}} instruction pattern @item @samp{cond_add@var{mode}} @itemx @samp{cond_sub@var{mode}} @itemx @samp{cond_mul@var{mode}} @@ -6935,6 +6938,9 @@ operand 0, otherwise (operand 2 + operand 3) is moved. @itemx @samp{cond_smax@var{mode}} @itemx @samp{cond_umin@var{mode}} @itemx @samp{cond_umax@var{mode}} +@itemx @samp{cond_ashl@var{mode}} +@itemx @samp{cond_ashr@var{mode}} +@itemx @samp{cond_lshr@var{mode}} When operand 1 is true, perform an operation on operands 2 and 3 and store the result in operand 0, otherwise store operand 4 in operand 0. The operation works elementwise if the operands are vectors. 
@@ -6962,6 +6968,11 @@ Operands 0, 2, 3 and 4 all have mode @var{m}. Operand 1 is a scalar integer if @var{m} is scalar, otherwise it has the mode returned by @code{TARGET_VECTORIZE_GET_MASK_MODE}. +@samp{cond_@var{op}@var{mode}} generally corresponds to a conditional +form of @samp{@var{op}@var{mode}3}. As an exception, the vector forms +of shifts correspond to patterns like @code{vashl@var{mode}3} rather +than patterns like @code{ashl@var{mode}3}. + @cindex @code{cond_fma@var{mode}} instruction pattern @cindex @code{cond_fms@var{mode}} instruction pattern @cindex @code{cond_fnma@var{mode}} instruction pattern -- cgit v1.1 From 72264a639729a5dcc21dbee304717ce22b338bfd Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sat, 17 Jul 2021 07:44:45 -0700 Subject: : Add pragma GCC target("general-regs-only") 1. Intrinsics in only require GPR ISAs. Add #if defined __MMX__ || defined __SSE__ #pragma GCC push_options #pragma GCC target("general-regs-only") #define __DISABLE_GENERAL_REGS_ONLY__ #endif and #ifdef __DISABLE_GENERAL_REGS_ONLY__ #undef __DISABLE_GENERAL_REGS_ONLY__ #pragma GCC pop_options #endif /* __DISABLE_GENERAL_REGS_ONLY__ */ to to disable non-GPR ISAs so that they can be used in functions with __attribute__ ((target("general-regs-only"))). 2. When checking always_inline attribute, if callee only uses GPRs, ignore MASK_80387 since enable MASK_80387 in caller has no impact on callee inline. gcc/ PR target/99744 * config/i386/i386.c (ix86_can_inline_p): Ignore MASK_80387 if callee only uses GPRs. * config/i386/ia32intrin.h: Revert commit 5463cee2770. * config/i386/serializeintrin.h: Revert commit 71958f740f1. * config/i386/x86gprintrin.h: Add #pragma GCC target("general-regs-only") and #pragma GCC pop_options to disable non-GPR ISAs. gcc/testsuite/ PR target/99744 * gcc.target/i386/pr99744-3.c: New test. * gcc.target/i386/pr99744-4.c: Likewise. * gcc.target/i386/pr99744-5.c: Likewise. * gcc.target/i386/pr99744-6.c: Likewise. 
* gcc.target/i386/pr99744-7.c: Likewise. * gcc.target/i386/pr99744-8.c: Likewise. --- gcc/config/i386/i386.c | 6 +- gcc/config/i386/ia32intrin.h | 14 +- gcc/config/i386/serializeintrin.h | 7 +- gcc/config/i386/x86gprintrin.h | 11 + gcc/testsuite/gcc.target/i386/pr99744-3.c | 13 ++ gcc/testsuite/gcc.target/i386/pr99744-4.c | 357 ++++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr99744-5.c | 25 +++ gcc/testsuite/gcc.target/i386/pr99744-6.c | 23 ++ gcc/testsuite/gcc.target/i386/pr99744-7.c | 12 + gcc/testsuite/gcc.target/i386/pr99744-8.c | 13 ++ 10 files changed, 477 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-7.c create mode 100644 gcc/testsuite/gcc.target/i386/pr99744-8.c (limited to 'gcc') diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ec06908..aea224a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -554,7 +554,7 @@ ix86_can_inline_p (tree caller, tree callee) /* Changes of those flags can be tolerated for always inlines. Lets hope user knows what he is doing. */ - const unsigned HOST_WIDE_INT always_inline_safe_mask + unsigned HOST_WIDE_INT always_inline_safe_mask = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD @@ -579,6 +579,10 @@ ix86_can_inline_p (tree caller, tree callee) && lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))); + /* If callee only uses GPRs, ignore MASK_80387. */ + if (TARGET_GENERAL_REGS_ONLY_P (callee_opts->x_ix86_target_flags)) + always_inline_safe_mask |= MASK_80387; + cgraph_node *callee_node = cgraph_node::get (callee); /* Callee's isa options should be a subset of the caller's, i.e. 
a SSE4 function can inline a SSE2 function but a SSE2 function can't inline diff --git a/gcc/config/i386/ia32intrin.h b/gcc/config/i386/ia32intrin.h index 5422b0f..df99220 100644 --- a/gcc/config/i386/ia32intrin.h +++ b/gcc/config/i386/ia32intrin.h @@ -107,12 +107,22 @@ __rdpmc (int __S) #endif /* __iamcu__ */ /* rdtsc */ -#define __rdtsc() __builtin_ia32_rdtsc () +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtsc (void) +{ + return __builtin_ia32_rdtsc (); +} #ifndef __iamcu__ /* rdtscp */ -#define __rdtscp(a) __builtin_ia32_rdtscp (a) +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtscp (unsigned int *__A) +{ + return __builtin_ia32_rdtscp (__A); +} #endif /* __iamcu__ */ diff --git a/gcc/config/i386/serializeintrin.h b/gcc/config/i386/serializeintrin.h index e280250..89b5b94 100644 --- a/gcc/config/i386/serializeintrin.h +++ b/gcc/config/i386/serializeintrin.h @@ -34,7 +34,12 @@ #define __DISABLE_SERIALIZE__ #endif /* __SERIALIZE__ */ -#define _serialize() __builtin_ia32_serialize () +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_serialize (void) +{ + __builtin_ia32_serialize (); +} #ifdef __DISABLE_SERIALIZE__ #undef __DISABLE_SERIALIZE__ diff --git a/gcc/config/i386/x86gprintrin.h b/gcc/config/i386/x86gprintrin.h index 7793032..b7fefa7 100644 --- a/gcc/config/i386/x86gprintrin.h +++ b/gcc/config/i386/x86gprintrin.h @@ -24,6 +24,12 @@ #ifndef _X86GPRINTRIN_H_INCLUDED #define _X86GPRINTRIN_H_INCLUDED +#if defined __MMX__ || defined __SSE__ +#pragma GCC push_options +#pragma GCC target("general-regs-only") +#define __DISABLE_GENERAL_REGS_ONLY__ +#endif + #include #ifndef __iamcu__ @@ -255,4 +261,9 @@ _ptwrite32 (unsigned __B) #endif /* __iamcu__ */ +#ifdef __DISABLE_GENERAL_REGS_ONLY__ +#undef __DISABLE_GENERAL_REGS_ONLY__ +#pragma GCC pop_options +#endif /* __DISABLE_GENERAL_REGS_ONLY__ */ + 
#endif /* _X86GPRINTRIN_H_INCLUDED. */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-3.c b/gcc/testsuite/gcc.target/i386/pr99744-3.c new file mode 100644 index 0000000..6c50581 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-serialize" } */ + +#include + +__attribute__ ((target("general-regs-only"))) +void +foo1 (void) +{ + _serialize (); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-4.c b/gcc/testsuite/gcc.target/i386/pr99744-4.c new file mode 100644 index 0000000..9196e62 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-4.c @@ -0,0 +1,357 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbmi -mbmi2 -mcldemote -mclflushopt -mclwb -mclzero -mcrc32 -menqcmd -mfsgsbase -mfxsr -mhreset -mlzcnt -mlwp -mmovdir64b -mmovdiri -mmwaitx -mpconfig -mpku -mpopcnt -mptwrite -mrdpid -mrdrnd -mrdseed -mrtm -msgx -mshstk -mtbm -mtsxldtrk -mxsave -mxsavec -mxsaveopt -mxsaves -mwaitpkg -mwbnoinvd" } */ +/* { dg-additional-options "-muintr" { target { ! ia32 } } } */ + +/* Test calling GPR intrinsics from functions with general-regs-only + target attribute. 
*/ + +#include + +#define _CONCAT(x,y) x ## y + +#define test_0(func, type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (void) \ + { return func (); } + +#define test_0_i1(func, type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (void) \ + { return func (imm); } + +#define test_1(func, type, op1_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A) \ + { return func (A); } + +#define test_1_i1(func, type, op1_type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A) \ + { return func (A, imm); } + +#define test_2(func, type, op1_type, op2_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B); } + +#define test_2_i1(func, type, op1_type, op2_type, imm) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B, imm); } + +#define test_3(func, type, op1_type, op2_type, op3_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C) \ + { return func (A, B, C); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C, \ + op4_type D) \ + { return func (A, B, C, D); } + +/* ia32intrin.h */ +test_1 (__bsfd, int, int) +test_1 (__bsrd, int, int) +test_1 (__bswapd, int, int) +test_1 (__popcntd, int, unsigned int) +test_2 (__rolb, unsigned char, unsigned char, int) +test_2 (__rolw, unsigned short, unsigned short, int) +test_2 (__rold, unsigned int, unsigned int, int) +test_2 (__rorb, unsigned char, unsigned char, int) +test_2 (__rorw, unsigned short, unsigned short, int) +test_2 (__rord, unsigned int, unsigned int, int) + +#ifndef __iamcu__ +/* adxintrin.h */ +test_4 (_subborrow_u32, unsigned 
char, unsigned char, unsigned int, + unsigned int, unsigned int *) +test_4 (_addcarry_u32, unsigned char, unsigned char, unsigned int, + unsigned int, unsigned int *) +test_4 (_addcarryx_u32, unsigned char, unsigned char, unsigned int, + unsigned int, unsigned int *) + +/* bmiintrin.h */ +test_1 (__tzcnt_u16, unsigned short, unsigned short) +test_2 (__andn_u32, unsigned int, unsigned int, unsigned int) +test_2 (__bextr_u32, unsigned int, unsigned int, unsigned int) +test_3 (_bextr_u32, unsigned int, unsigned int, unsigned int, + unsigned int) +test_1 (__blsi_u32, unsigned int, unsigned int) +test_1 (_blsi_u32, unsigned int, unsigned int) +test_1 (__blsmsk_u32, unsigned int, unsigned int) +test_1 (_blsmsk_u32, unsigned int, unsigned int) +test_1 (__blsr_u32, unsigned int, unsigned int) +test_1 (_blsr_u32, unsigned int, unsigned int) +test_1 (__tzcnt_u32, unsigned int, unsigned int) +test_1 (_tzcnt_u32, unsigned int, unsigned int) + +/* bmi2intrin.h */ +test_2 (_bzhi_u32, unsigned int, unsigned int, unsigned int) +test_2 (_pdep_u32, unsigned int, unsigned int, unsigned int) +test_2 (_pext_u32, unsigned int, unsigned int, unsigned int) + +/* cetintrin.h */ +test_1 (_inc_ssp, void, unsigned int) +test_0 (_saveprevssp, void) +test_1 (_rstorssp, void, void *) +test_2 (_wrssd, void, unsigned int, void *) +test_2 (_wrussd, void, unsigned int, void *) +test_0 (_setssbsy, void) +test_1 (_clrssbsy, void, void *) + +/* cldemoteintrin.h */ +test_1 (_cldemote, void, void *) + +/* clflushoptintrin.h */ +test_1 (_mm_clflushopt, void, void *) + +/* clwbintrin.h */ +test_1 (_mm_clwb, void, void *) + +/* clzerointrin.h */ +test_1 (_mm_clzero, void, void *) + +/* enqcmdintrin.h */ +test_2 (_enqcmd, int, void *, const void *) +test_2 (_enqcmds, int, void *, const void *) + +/* fxsrintrin.h */ +test_1 (_fxsave, void, void *) +test_1 (_fxrstor, void, void *) + +/* hresetintrin.h */ +test_1 (_hreset, void, unsigned int) + +/* ia32intrin.h */ +test_2 (__crc32b, unsigned int, unsigned char, 
unsigned char) +test_2 (__crc32w, unsigned int, unsigned short, unsigned short) +test_2 (__crc32d, unsigned int, unsigned int, unsigned int) +test_1 (__rdpmc, unsigned long long, int) +test_0 (__rdtsc, unsigned long long) +test_1 (__rdtscp, unsigned long long, unsigned int *) +test_0 (__pause, void) + +/* lzcntintrin.h */ +test_1 (__lzcnt16, unsigned short, unsigned short) +test_1 (__lzcnt32, unsigned int, unsigned int) +test_1 (_lzcnt_u32, unsigned int, unsigned int) + +/* lwpintrin.h */ +test_1 (__llwpcb, void, void *) +test_0 (__slwpcb, void *) +test_2_i1 (__lwpval32, void, unsigned int, unsigned int, 1) +test_2_i1 (__lwpins32, unsigned char, unsigned int, unsigned int, 1) + +/* movdirintrin.h */ +test_2 (_directstoreu_u32, void, void *, unsigned int) +test_2 (_movdir64b, void, void *, const void *) + +/* mwaitxintrin.h */ +test_3 (_mm_monitorx, void, void const *, unsigned int, unsigned int) +test_3 (_mm_mwaitx, void, unsigned int, unsigned int, unsigned int) + +/* pconfigintrin.h */ +test_2 (_pconfig_u32, unsigned int, const unsigned int, size_t *) + +/* pkuintrin.h */ +test_0 (_rdpkru_u32, unsigned int) +test_1 (_wrpkru, void, unsigned int) + +/* popcntintrin.h */ +test_1 (_mm_popcnt_u32, int, unsigned int) + +/* rdseedintrin.h */ +test_1 (_rdseed16_step, int, unsigned short *) +test_1 (_rdseed32_step, int, unsigned int *) + +/* rtmintrin.h */ +test_0 (_xbegin, unsigned int) +test_0 (_xend, void) +test_0_i1 (_xabort, void, 1) + +/* sgxintrin.h */ +test_2 (_encls_u32, unsigned int, const unsigned int, size_t *) +test_2 (_enclu_u32, unsigned int, const unsigned int, size_t *) +test_2 (_enclv_u32, unsigned int, const unsigned int, size_t *) + +/* tbmintrin.h */ +test_1_i1 (__bextri_u32, unsigned int, unsigned int, 1) +test_1 (__blcfill_u32, unsigned int, unsigned int) +test_1 (__blci_u32, unsigned int, unsigned int) +test_1 (__blcic_u32, unsigned int, unsigned int) +test_1 (__blcmsk_u32, unsigned int, unsigned int) +test_1 (__blcs_u32, unsigned int, unsigned 
int) +test_1 (__blsfill_u32, unsigned int, unsigned int) +test_1 (__blsic_u32, unsigned int, unsigned int) +test_1 (__t1mskc_u32, unsigned int, unsigned int) +test_1 (__tzmsk_u32, unsigned int, unsigned int) + +/* tsxldtrkintrin.h */ +test_0 (_xsusldtrk, void) +test_0 (_xresldtrk, void) + +/* x86gprintrin.h */ +test_1 (_ptwrite32, void, unsigned int) +test_1 (_rdrand16_step, int, unsigned short *) +test_1 (_rdrand32_step, int, unsigned int *) +test_0 (_wbinvd, void) + +/* xtestintrin.h */ +test_0 (_xtest, int) + +/* xsaveintrin.h */ +test_2 (_xsave, void, void *, long long) +test_2 (_xrstor, void, void *, long long) +test_2 (_xsetbv, void, unsigned int, long long) +test_1 (_xgetbv, long long, unsigned int) + +/* xsavecintrin.h */ +test_2 (_xsavec, void, void *, long long) + +/* xsaveoptintrin.h */ +test_2 (_xsaveopt, void, void *, long long) + +/* xsavesintrin.h */ +test_2 (_xsaves, void, void *, long long) +test_2 (_xrstors, void, void *, long long) + +/* wbnoinvdintrin.h */ +test_0 (_wbnoinvd, void) + +#ifdef __x86_64__ +/* adxintrin.h */ +test_4 (_subborrow_u64, unsigned char, unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) +test_4 (_addcarry_u64, unsigned char, unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) +test_4 (_addcarryx_u64, unsigned char, unsigned char, + unsigned long long, unsigned long long, + unsigned long long *) + +/* bmiintrin.h */ +test_2 (__andn_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (__bextr_u64, unsigned long long, unsigned long long, + unsigned long long) +test_3 (_bextr_u64, unsigned long long, unsigned long long, + unsigned long long, unsigned long long) +test_1 (__blsi_u64, unsigned long long, unsigned long long) +test_1 (_blsi_u64, unsigned long long, unsigned long long) +test_1 (__blsmsk_u64, unsigned long long, unsigned long long) +test_1 (_blsmsk_u64, unsigned long long, unsigned long long) +test_1 (__blsr_u64, unsigned long long, 
unsigned long long) +test_1 (_blsr_u64, unsigned long long, unsigned long long) +test_1 (__tzcnt_u64, unsigned long long, unsigned long long) +test_1 (_tzcnt_u64, unsigned long long, unsigned long long) + +/* bmi2intrin.h */ +test_2 (_bzhi_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (_pdep_u64, unsigned long long, unsigned long long, + unsigned long long) +test_2 (_pext_u64, unsigned long long, unsigned long long, + unsigned long long) +test_3 (_mulx_u64, unsigned long long, unsigned long long, + unsigned long long, unsigned long long *) + +/* cetintrin.h */ +test_0 (_get_ssp, unsigned long long) +test_2 (_wrssq, void, unsigned long long, void *) +test_2 (_wrussq, void, unsigned long long, void *) + +/* fxsrintrin.h */ +test_1 (_fxsave64, void, void *) +test_1 (_fxrstor64, void, void *) + +/* ia32intrin.h */ +test_1 (__bsfq, int, long long) +test_1 (__bsrq, int, long long) +test_1 (__bswapq, long long, long long) +test_2 (__crc32q, unsigned long long, unsigned long long, + unsigned long long) +test_1 (__popcntq, long long, unsigned long long) +test_2 (__rolq, unsigned long long, unsigned long long, int) +test_2 (__rorq, unsigned long long, unsigned long long, int) +test_0 (__readeflags, unsigned long long) +test_1 (__writeeflags, void, unsigned int) + +/* lzcntintrin.h */ +test_1 (__lzcnt64, unsigned long long, unsigned long long) +test_1 (_lzcnt_u64, unsigned long long, unsigned long long) + +/* lwpintrin.h */ +test_2_i1 (__lwpval64, void, unsigned long long, unsigned int, 1) +test_2_i1 (__lwpins64, unsigned char, unsigned long long, + unsigned int, 1) + +/* movdirintrin.h */ +test_2 (_directstoreu_u64, void, void *, unsigned long long) + +/* popcntintrin.h */ +test_1 (_mm_popcnt_u64, long long, unsigned long long) + +/* rdseedintrin.h */ +test_1 (_rdseed64_step, int, unsigned long long *) + +/* tbmintrin.h */ +test_1_i1 (__bextri_u64, unsigned long long, unsigned long long, 1) +test_1 (__blcfill_u64, unsigned long long, unsigned 
long long) +test_1 (__blci_u64, unsigned long long, unsigned long long) +test_1 (__blcic_u64, unsigned long long, unsigned long long) +test_1 (__blcmsk_u64, unsigned long long, unsigned long long) +test_1 (__blcs_u64, unsigned long long, unsigned long long) +test_1 (__blsfill_u64, unsigned long long, unsigned long long) +test_1 (__blsic_u64, unsigned long long, unsigned long long) +test_1 (__t1mskc_u64, unsigned long long, unsigned long long) +test_1 (__tzmsk_u64, unsigned long long, unsigned long long) + +/* uintrintrin.h */ +test_0 (_clui, void) +test_1 (_senduipi, void, unsigned long long) +test_0 (_stui, void) +test_0 (_testui, unsigned char) + +/* x86gprintrin.h */ +test_1 (_ptwrite64, void, unsigned long long) +test_0 (_readfsbase_u32, unsigned int) +test_0 (_readfsbase_u64, unsigned long long) +test_0 (_readgsbase_u32, unsigned int) +test_0 (_readgsbase_u64, unsigned long long) +test_1 (_rdrand64_step, int, unsigned long long *) +test_1 (_writefsbase_u32, void, unsigned int) +test_1 (_writefsbase_u64, void, unsigned long long) +test_1 (_writegsbase_u32, void, unsigned int) +test_1 (_writegsbase_u64, void, unsigned long long) + +/* xsaveintrin.h */ +test_2 (_xsave64, void, void *, long long) +test_2 (_xrstor64, void, void *, long long) + +/* xsavecintrin.h */ +test_2 (_xsavec64, void, void *, long long) + +/* xsaveoptintrin.h */ +test_2 (_xsaveopt64, void, void *, long long) + +/* xsavesintrin.h */ +test_2 (_xsaves64, void, void *, long long) +test_2 (_xrstors64, void, void *, long long) + +/* waitpkgintrin.h */ +test_1 (_umonitor, void, void *) +test_2 (_umwait, unsigned char, unsigned int, unsigned long long) +test_2 (_tpause, unsigned char, unsigned int, unsigned long long) + +#else /* !__x86_64__ */ +/* bmi2intrin.h */ +test_3 (_mulx_u32, unsigned int, unsigned int, unsigned int, + unsigned int *) + +/* cetintrin.h */ +test_0 (_get_ssp, unsigned int) +#endif /* __x86_64__ */ + +#endif diff --git a/gcc/testsuite/gcc.target/i386/pr99744-5.c 
b/gcc/testsuite/gcc.target/i386/pr99744-5.c new file mode 100644 index 0000000..9e40e5e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-5.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mmwait" } */ + +/* Test calling MWAIT intrinsics from functions with general-regs-only + target attribute. */ + +#include + +#define _CONCAT(x,y) x ## y + +#define test_2(func, type, op1_type, op2_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B) \ + { return func (A, B); } + +#define test_3(func, type, op1_type, op2_type, op3_type) \ + __attribute__ ((target("general-regs-only"))) \ + type _CONCAT(do_,func) (op1_type A, op2_type B, op3_type C) \ + { return func (A, B, C); } + +#ifndef __iamcu__ +/* mwaitintrin.h */ +test_3 (_mm_monitor, void, void const *, unsigned int, unsigned int) +test_2 (_mm_mwait, void, unsigned int, unsigned int) +#endif diff --git a/gcc/testsuite/gcc.target/i386/pr99744-6.c b/gcc/testsuite/gcc.target/i386/pr99744-6.c new file mode 100644 index 0000000..4025918 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-6.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +#include + +extern unsigned long long int curr_deadline; +extern void bar (void); + +void +foo1 (void) +{ + if (__rdtsc () < curr_deadline) + return; + bar (); +} + +void +foo2 (unsigned int *p) +{ + if (__rdtscp (p) < curr_deadline) + return; + bar (); +} diff --git a/gcc/testsuite/gcc.target/i386/pr99744-7.c b/gcc/testsuite/gcc.target/i386/pr99744-7.c new file mode 100644 index 0000000..30b7ca0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-7.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mno-avx -Wno-psabi" } */ + +#include + +void +foo (__m256 *x) +{ + x[0] = _mm256_sub_ps (x[1], x[2]); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99744-8.c 
b/gcc/testsuite/gcc.target/i386/pr99744-8.c new file mode 100644 index 0000000..115183e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr99744-8.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O -Wno-psabi" } */ + +#include + +__attribute__((target ("no-avx"))) +void +foo (__m256 *x) +{ + x[0] = _mm256_sub_ps (x[1], x[2]); +} + +/* { dg-error "target specific option mismatch" "" { target *-*-* } 0 } */ -- cgit v1.1 From 03d47da7e1e91adddbde261ffefd2760df59a564 Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Thu, 5 Aug 2021 14:00:35 +0100 Subject: testsuite: Fix warning introduced by nodiscard in libstdc++ Signed-off-by: Jonathan Wakely gcc/testsuite/ChangeLog: * g++.old-deja/g++.other/inline7.C: Cast nodiscard call to void. --- gcc/testsuite/g++.old-deja/g++.other/inline7.C | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/testsuite/g++.old-deja/g++.other/inline7.C b/gcc/testsuite/g++.old-deja/g++.other/inline7.C index a3723cf..6260000 100644 --- a/gcc/testsuite/g++.old-deja/g++.other/inline7.C +++ b/gcc/testsuite/g++.old-deja/g++.other/inline7.C @@ -8,7 +8,7 @@ std::list li; void f () { - li.size (); + (void) li.size (); } int main () -- cgit v1.1