openmp: Optimize triangular loop logical iterator to actual iterators computation using search for quadratic equation root(s)

This patch implements the optimized logical to actual iterators computation for triangular loops. I have a rough implementation using integers, but this one uses floating point. There is a small problem that -fopenmp programs aren't linked with -lm, so it does it only if the hardware has a sqrt optab (and uses the ifn rather than __builtin_sqrt because it obviously doesn't need errno handling etc.). Do you think it is ok this way, or should I use the integral computation using an inlined isqrt (we have an inequality of the form start >= x * t10 + t11 * (((x - 1) * x) / 2) where t10 and t11 are signed long long values and start is unsigned long long, and the division by 2 actually is a problem for accuracy in some cases, so if we do it in integers, we actually need to do long long t12 = 2 * t10 - t11; unsigned long long t13 = t12 * t12 + start * 8 * t11; unsigned long long isqrt_ = isqrtull (t13); long long x = (((long long) isqrt_ - t12) / t11) >> 1; with careful overflow checking on all the computations before isqrtull (and on overflows use the fallback implementation))? 2020-07-09 Jakub Jelinek <jakub@redhat.com> * omp-general.h (struct omp_for_data): Add min_inner_iterations and factor members. * omp-general.c (omp_extract_for_data): Initialize them and remember them in OMP_CLAUSE_COLLAPSE_COUNT if needed and restore from there. * omp-expand.c (expand_omp_for_init_counts): Fix up computation of counts[fd->last_nonrect] if fd->loop.n2 is INTEGER_CST. (expand_omp_for_init_vars): For fd->first_nonrect + 1 == fd->last_nonrect loops with for now INTEGER_CST fd->loop.n2 find quadratic equation roots instead of using fallback method when possible. * testsuite/libgomp.c/loop-19.c: New test. * testsuite/libgomp.c/loop-20.c: New test.
author: Jakub Jelinek <jakub@redhat.com> 2020-07-09 12:07:17 +0200
committer: Jakub Jelinek <jakub@redhat.com> 2020-07-09 12:07:17 +0200
commit: 5acef69f9d3d9f3c537b5e5157519edf02f86c4d (patch)
tree: af18107dc1e787b46c735b2eea3fad74d6b091a2 /libgomp/testsuite/libgomp.c
parent: ea82325afeccf3604f393916832eaadcbe1225bd (diff)
download: gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.zip
gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.tar.gz
gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.tar.bz2
2 files changed, 170 insertions, 0 deletions
diff --git a/libgomp/testsuite/libgomp.c/loop-19.c b/libgomp/testsuite/libgomp.c/loop-19.c
new file mode 100644
index 0000000..732ebb0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/loop-19.c
@@ -0,0 +1,86 @@
+/* { dg-do run } */
+
+extern void abort (void);
+
+int x, i, j; /* i, j: collapsed loop iterators; x: last (i, j) encoding; all three are lastprivate below */
+volatile int a, b, c, d, e, f, g, h; /* run-time bounds, steps and coefficients for the non-constant loop variants */
+int k[16][67]; /* per-(i, j) visit marker, incremented once by each parallel loop */
+
+int
+main ()
+{
+  int niters; /* iteration count, accumulated through reduction(+) */
+  for (i = 0; i < 16; i++)
+    for (j = i * 2 + 1; j < 4 * i + 3; j++)
+      k[i][j] = 1; /* mark every cell of the first iteration space (inner bounds depend on i) */
+  a = 0; b = 16; c = 1; d = 2; e = 1; f = 4; g = 3; h = 1; /* same space as: i in [a, b) step c, j in [d*i+e, g+i*f) step h */
+  niters = 0; i = -100; j = -100; x = -100; /* reset the count; poison i, j, x so stale values fail the check after the loop */
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 0; i < 16; i++) /* variant 1: bounds written as constants */
+    for (j = i * 2 + 1; j < 4 * i + 3; j++)
+      {
+	if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 3 + i * 4 || k[i][j] != 1) /* out of range, or cell not in its pre-loop state */
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023); /* encodes the current (i, j); the sequentially last value is checked below */
+	niters++;
+      }
+  if (i != 16 || j != 63 || x != 15422 || niters != 272) /* 272 = sum of (2*i + 2) for i = 0..15; 15422 = 15 * 1024 + 62 */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c) /* variant 2: same iteration space, bounds read from the volatile a..h */
+    for (j = d * i + e; j < g + i * f; j += h)
+      {
+	if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 3 + i * 4 || k[i][j] != 2) /* cell must have been visited once by variant 1 */
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 16 || j != 63 || x != 15422 || niters != 272)
+    abort ();
+  for (i = 0; i < 16; i++)
+    for (j = i * 2 + 1; j < 4 * i + 3; j++)
+      if (k[i][j] == 3) /* 1 (initial mark) + one increment from each of the two loops */
+	k[i][j] = 0;
+      else
+	abort ();
+  for (i = 0; i < 16; i++)
+    for (j = i * 2 + 1; j < 2 * i + 7; j++)
+      k[i][j] = 1; /* second iteration space: the inner range shifts with i but always has 6 iterations */
+  a = 0; b = 16; c = 1; d = 2; e = 1; f = 2; g = 7; h = 1; /* a..h values describing the second space */
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 0; i < 16; i++) /* variant 1 of the second space: constant bounds */
+    for (j = i * 2 + 1; j < 2 * i + 7; j++)
+      {
+	if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 7 + i * 2 || k[i][j] != 1)
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 16 || j != 37 || x != 15396 || niters != 96) /* 96 = 16 * 6; 15396 = 15 * 1024 + 36 */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c) /* variant 2 of the second space: bounds read from a..h */
+    for (j = d * i + e; j < g + i * f; j += h)
+      {
+	if (i < 0 || i >= 16 || j < 2 * i + 1 || j >= 7 + i * 2 || k[i][j] != 2)
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 16 || j != 37 || x != 15396 || niters != 96)
+    abort ();
+  for (i = 0; i < 16; i++)
+    for (j = i * 2 + 1; j < 2 * i + 7; j++)
+      if (k[i][j] == 3) /* every cell visited exactly once by each loop */
+	k[i][j] = 0;
+      else
+	abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/loop-20.c b/libgomp/testsuite/libgomp.c/loop-20.c
new file mode 100644
index 0000000..c3265ac
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/loop-20.c
@@ -0,0 +1,84 @@
+/* { dg-do run } */
+
+extern void abort (void);
+
+unsigned long long int x, i, j; /* i, j: collapsed loop iterators; x: last (i, j) encoding; all three are lastprivate below */
+volatile unsigned long long int a, b, c, d, e, f, g, h; /* run-time bounds, steps and coefficients for the non-constant loop variants */
+int k[4][206]; /* visit markers, incremented once by each parallel loop */
+
+int
+main ()
+{
+  long long int niters; /* iteration count, accumulated through reduction(+) */
+  for (j = ~0ULL / 2 - 32; j < ((~0ULL / 2) + 6); j++) /* 38 values of j straddling ~0ULL / 2 */
+    k[0][j - ~0ULL / 2 + 64] = 1; /* the offset maps these j values to indices 32..69 of row 0 */
+  a = 1; b = 2; c = 1; d = 0; e = ~0ULL / 2 - 32; f = ((~0ULL / 2) + 6); g = 0; h = 1; /* same space as: i in [a, b) step c, j in [d*i+e, g+i*f) step h */
+  niters = 0; i = -100; j = -100; x = -100; /* reset the count; poison i, j, x (-100 converted to unsigned) so stale values fail the check */
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 1; i < 2; i++) /* variant 1: bounds written as constants; single outer iteration, inner upper bound is a multiple of i */
+    for (j = ~0ULL / 2 - 32; j < i * ((~0ULL / 2) + 6); j++)
+      {
+	if (i != 1 || j < ~0ULL / 2 - 32 || j >= ((~0ULL / 2) + 6) || k[0][j - ~0ULL / 2 + 64] != 1) /* out of range, or cell not in its pre-loop state */
+	  abort ();
+	k[0][j - ~0ULL / 2 + 64]++;
+	x = i * 1024 + (j & 1023); /* encodes the current (i, j); the sequentially last value is checked below */
+	niters++;
+      }
+  if (i != 2 || j != ((~0ULL / 2) + 6) || x != 1028 || niters != 38) /* 1028 = 1 * 1024 + 4, where 4 is the low 10 bits of ~0ULL / 2 + 5 */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c) /* variant 2: same iteration space, bounds read from the volatile a..h */
+    for (j = d * i + e; j < g + i * f; j += h)
+      {
+	if (i != 1 || j < ~0ULL / 2 - 32 || j >= ((~0ULL / 2) + 6) || k[0][j - ~0ULL / 2 + 64] != 2) /* cell must have been visited once by variant 1 */
+	  abort ();
+	k[0][j - ~0ULL / 2 + 64]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 2 || j != ((~0ULL / 2) + 6) || x != 1028 || niters != 38)
+    abort ();
+  for (j = ~0ULL / 2 - 32; j < ((~0ULL / 2) + 6); j++)
+    if (k[0][j - ~0ULL / 2 + 64] == 3) /* 1 (initial mark) + one increment from each of the two loops */
+      k[0][j - ~0ULL / 2 + 64] = 0;
+    else
+      abort ();
+  for (i = 1; i < 4; i++)
+    for (j = 64ULL * i; j < i * 32ULL + 110; j++)
+      k[i][j] = 1; /* second iteration space: the inner range shrinks as i grows (78, 46, 14 iterations) */
+  a = 1; b = 4; c = 1; d = 64ULL; e = 0; f = 32ULL; g = 110ULL; h = 1; /* a..h values describing the second space */
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = 1; i < 4; i++) /* variant 1 of the second space: constant bounds */
+    for (j = 64ULL * i; j < i * 32ULL + 110; j++)
+      {
+	if (i < 1 || i >= 4 || j < 64ULL * i || j >= i * 32ULL + 110 || k[i][j] != 1)
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 4 || j != 206 || x != 3277 || niters != 138) /* 138 = 78 + 46 + 14; 3277 = 3 * 1024 + 205 */
+    abort ();
+  niters = 0; i = -100; j = -100; x = -100;
+  #pragma omp parallel for collapse(2) lastprivate (i, j, x) reduction(+:niters)
+  for (i = a; i < b; i += c) /* variant 2 of the second space: bounds read from a..h */
+    for (j = d * i + e; j < g + i * f; j += h)
+      {
+	if (i < 1 || i >= 4 || j < 64ULL * i || j >= i * 32ULL + 110 || k[i][j] != 2)
+	  abort ();
+	k[i][j]++;
+	x = i * 1024 + (j & 1023);
+	niters++;
+      }
+  if (i != 4 || j != 206 || x != 3277 || niters != 138)
+    abort ();
+  for (i = 1; i < 4; i++)
+    for (j = 64ULL * i; j < i * 32ULL + 110; j++)
+      if (k[i][j] == 3) /* every cell visited exactly once by each loop */
+	k[i][j] = 0;
+      else
+	abort ();
+  return 0;
+}
author	Jakub Jelinek <jakub@redhat.com>	2020-07-09 12:07:17 +0200
committer	Jakub Jelinek <jakub@redhat.com>	2020-07-09 12:07:17 +0200
commit	5acef69f9d3d9f3c537b5e5157519edf02f86c4d (patch)
tree	af18107dc1e787b46c735b2eea3fad74d6b091a2 /libgomp/testsuite/libgomp.c
parent	ea82325afeccf3604f393916832eaadcbe1225bd (diff)
download	gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.zip gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.tar.gz gcc-5acef69f9d3d9f3c537b5e5157519edf02f86c4d.tar.bz2