8 files changed, 63 insertions, 13 deletions
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/firstprivate.c b/libgomp/testsuite/libgomp.c-target/aarch64/firstprivate.c
index 930ca62..58674e2 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/firstprivate.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/firstprivate.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <omp.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/lastprivate.c b/libgomp/testsuite/libgomp.c-target/aarch64/lastprivate.c
index be5a618..2f93d7b 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/lastprivate.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/lastprivate.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <omp.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/private.c b/libgomp/testsuite/libgomp.c-target/aarch64/private.c
index 0ca74fe..fed5370 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/private.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/private.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <omp.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/shared.c b/libgomp/testsuite/libgomp.c-target/aarch64/shared.c
index dec41b8..340a668 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/shared.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/shared.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <stdlib.h>
 #include <omp.h>
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/simd-aligned.c b/libgomp/testsuite/libgomp.c-target/aarch64/simd-aligned.c
index cc41139..14642c9 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/simd-aligned.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/simd-aligned.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <stdint.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/simd-nontemporal.c b/libgomp/testsuite/libgomp.c-target/aarch64/simd-nontemporal.c
index 3385427..6fe4616 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/simd-nontemporal.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/simd-nontemporal.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <stdint.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/threadprivate.c b/libgomp/testsuite/libgomp.c-target/aarch64/threadprivate.c
index 4a39312..aa7d2f9 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/threadprivate.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/threadprivate.c
@@ -1,6 +1,8 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
 /* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
 
+#pragma GCC target "+sve"
+
 #include <arm_sve.h>
 #include <stdint.h>
 
diff --git a/libgomp/testsuite/libgomp.c-target/aarch64/udr-sve.c b/libgomp/testsuite/libgomp.c-target/aarch64/udr-sve.c
index c79f4a9..02e02dc 100644
--- a/libgomp/testsuite/libgomp.c-target/aarch64/udr-sve.c
+++ b/libgomp/testsuite/libgomp.c-target/aarch64/udr-sve.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve256_hw } } */
-/* { dg-options "-msve-vector-bits=256 -fopenmp -O2" } */
+/* { dg-options "-march=armv8-a+sve -msve-vector-bits=256 -fopenmp -O2" } */
 
 #include <arm_sve.h>
 
@@ -9,8 +9,8 @@
 void __attribute__ ((noipa))
 parallel_reduction ()
 {
-  int a[8] = {1 ,1, 1, 1, 1, 1, 1, 1};
-  int b[8] = {0 ,0, 0, 0, 0, 0, 0, 0};
+  int a[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+  int b[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   svint32_t va = svld1_s32 (svptrue_b32 (), b);
   int i = 0;
   int64_t res;
@@ -30,15 +30,15 @@ parallel_reduction ()
 void __attribute__ ((noipa))
 for_reduction ()
 {
-  int a[8] = {1 ,1, 1, 1, 1, 1, 1, 1};
-  int b[8] = {0 ,0, 0, 0, 0, 0, 0, 0};
+  int a[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+  int b[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   svint32_t va = svld1_s32 (svptrue_b32 (), b);
   int j;
   int64_t res;
 
   #pragma omp parallel for reduction (+:va)
   for (j = 0; j < 8; j++)
-    va = svld1_s32 (svptrue_b32 (), a);
+    va += svld1_s32 (svptrue_b32 (), a);
 
   res = svaddv_s32 (svptrue_b32 (), va);
 
@@ -58,13 +58,13 @@ simd_reduction ()
   for (j = 0; j < 8; j++)
     a[j] = 1;
 
-  #pragma omp simd reduction (+:va, i)
+  #pragma omp simd reduction (+:va)
   for (j = 0; j < 16; j++)
-    va = svld1_s32 (svptrue_b32 (), a);
+    va += svld1_s32 (svptrue_b32 (), a);
 
   res = svaddv_s32 (svptrue_b32 (), va);
 
-  if (res != 8)
+  if (res != 128)
     __builtin_abort ();
 }
 
@@ -72,22 +72,57 @@ void __attribute__ ((noipa))
 inscan_reduction_incl ()
 {
   svint32_t va = svindex_s32 (0, 0);
+  int a[8] = {1, 1, 1, 1, 1, 1, 1, 1};  /* Per-iteration addend.  */
+  int b[64] = { 0 };  /* Eight 8-lane prefix sums, one per iteration.  */
   int j;
   int64_t res = 0;
 
-  #pragma omp parallel
-  #pragma omp for reduction (inscan,+:va) firstprivate (res) lastprivate (res)
+  #pragma omp parallel for reduction (inscan, +:va)
   for (j = 0; j < 8; j++)
     {
-      va = svindex_s32 (1, 0);
+      va += svld1_s32 (svptrue_b32 (), a);  /* Input phase.  */
       #pragma omp scan inclusive (va)
-      res += svaddv_s32 (svptrue_b32 (), va);
+      svst1_s32 (svptrue_b32 (), b + j * 8, va);  /* Scan phase.  */
     }
 
+  res = svaddv_s32 (svptrue_b32 (), va);  /* 8 lanes * 8 iterations.  */
+
   if (res != 64)
     __builtin_abort ();
+
+  for (j = 0; j < 64; j++)  /* Check all lanes, not just lane 0.  */
+    if (b[j] != (j / 8 + 1))
+      __builtin_abort ();
 }
 
+/* Exclusive scan: iteration J must see the sum of iterations 0 .. J-1.  */
+void __attribute__ ((noipa))
+inscan_reduction_excl ()
+{
+  svint32_t va = svindex_s32 (0, 0);
+  int a[8] = {1, 1, 1, 1, 1, 1, 1, 1};  /* Per-iteration addend.  */
+  int b[64] = { 0 };  /* Eight 8-lane prefix sums, one per iteration.  */
+  int j;
+  int64_t res = 0;
+
+  #pragma omp parallel for reduction (inscan, +:va)
+  for (j = 0; j < 8; j++)
+    {
+      svst1_s32 (svptrue_b32 (), b + j * 8, va);  /* Scan phase.  */
+      #pragma omp scan exclusive (va)
+      va += svld1_s32 (svptrue_b32 (), a);  /* Input phase.  */
+    }
+
+  res = svaddv_s32 (svptrue_b32 (), va);  /* 8 lanes * 8 iterations.  */
+
+  if (res != 64)
+    __builtin_abort ();
+
+  for (j = 0; j < 64; j++)  /* Check all lanes, not just lane 0.  */
+    if (b[j] != j / 8)
+      __builtin_abort ();
+}
+
 int
 main ()
 {
@@ -95,4 +130,5 @@ main ()
   for_reduction ();
   simd_reduction ();
   inscan_reduction_incl ();
+  inscan_reduction_excl ();
 }