Initial commit of coarray_native branch.

author: I'm not telling you my name, idiot <git_is@stupid.com> 2020-09-23 18:14:20 +0200
committer: I'm not telling you my name, idiot <git_is@stupid.com> 2020-09-23 18:14:20 +0200
commit: bef0a39f2f8e87780f990d12fa71f5ed0039267a (patch)
tree: ce8138e924d9bfff3ce0b63aa4c29397fbfaa1a5 /libgfortran/generated
parent: 9044db88d634c631920eaa9f66c0275adf18fdf5 (diff)
download: gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.zip
gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.tar.gz
gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.tar.bz2
11 files changed, 6865 insertions, 0 deletions
diff --git a/libgfortran/generated/nca_minmax_i1.c b/libgfortran/generated/nca_minmax_i1.c
new file mode 100644
index 0000000..3bc9a2b
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_i1.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_INTEGER_1)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_i1);
+
+void
+nca_collsub_max_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_1 *a, *b;
+  GFC_INTEGER_1 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_1) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_i1);
+
+void
+nca_collsub_min_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_1 *a, *b;
+  GFC_INTEGER_1 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_1) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_i1);
+
+void
+nca_collsub_sum_scalar_i1 (GFC_INTEGER_1 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_1 *a, *b;
+  GFC_INTEGER_1 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_1) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_i1);
+
+void
+nca_collsub_max_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_1 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_1 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_1);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_1);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_1);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_1 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_1 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_1 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_1 *a;
+	  GFC_INTEGER_1 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_1 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_1 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_i1);
+
+void
+nca_collsub_min_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_1 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_1 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_1);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_1);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_1);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_1 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_1 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_1 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_1 *a;
+	  GFC_INTEGER_1 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_1 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_1 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_i1);
+
+void
+nca_collsub_sum_array_i1 (gfc_array_i1 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_1 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_1 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_1);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_1);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_1);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_1 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_1 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_1 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_1 *a;
+	  GFC_INTEGER_1 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_1 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_1 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_i16.c b/libgfortran/generated/nca_minmax_i16.c
new file mode 100644
index 0000000..8fbb948
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_i16.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_INTEGER_16)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_i16);
+
+void
+nca_collsub_max_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_16 *a, *b;
+  GFC_INTEGER_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_i16);
+
+void
+nca_collsub_min_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_16 *a, *b;
+  GFC_INTEGER_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_i16);
+
+void
+nca_collsub_sum_scalar_i16 (GFC_INTEGER_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_16 *a, *b;
+  GFC_INTEGER_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_i16);
+
+void
+nca_collsub_max_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_16 *a;
+	  GFC_INTEGER_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_i16);
+
+void
+nca_collsub_min_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_16 *a;
+	  GFC_INTEGER_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_i16);
+
+void
+nca_collsub_sum_array_i16 (gfc_array_i16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_16 *a;
+	  GFC_INTEGER_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_i2.c b/libgfortran/generated/nca_minmax_i2.c
new file mode 100644
index 0000000..61908d6
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_i2.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_INTEGER_2)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_i2);
+
+void
+nca_collsub_max_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_2 *a, *b;
+  GFC_INTEGER_2 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_2) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_i2);
+
+void
+nca_collsub_min_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_2 *a, *b;
+  GFC_INTEGER_2 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_2) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_i2);
+
+void
+nca_collsub_sum_scalar_i2 (GFC_INTEGER_2 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_2 *a, *b;
+  GFC_INTEGER_2 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_2) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_i2);
+
+void
+nca_collsub_max_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_2 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_2 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_2);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_2);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_2);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_2 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_2 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_2 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_2 *a;
+	  GFC_INTEGER_2 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_2 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_2 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_i2);
+
+void
+nca_collsub_min_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_2 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_2 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_2);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_2);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_2);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_2 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_2 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_2 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_2 *a;
+	  GFC_INTEGER_2 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_2 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_2 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_i2);
+
+void
+nca_collsub_sum_array_i2 (gfc_array_i2 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_2 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_2 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_2);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_2);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_2);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_2 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_2 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_2 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_2 *a;
+	  GFC_INTEGER_2 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_2 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_2 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_i4.c b/libgfortran/generated/nca_minmax_i4.c
new file mode 100644
index 0000000..5e37586
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_i4.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_INTEGER_4)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_i4);
+
+void
+nca_collsub_max_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_4 *a, *b;
+  GFC_INTEGER_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_i4);
+
+void
+nca_collsub_min_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_4 *a, *b;
+  GFC_INTEGER_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_i4);
+
+void
+nca_collsub_sum_scalar_i4 (GFC_INTEGER_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_4 *a, *b;
+  GFC_INTEGER_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_i4);
+
+void
+nca_collsub_max_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_4 *a;
+	  GFC_INTEGER_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_i4);
+
+void
+nca_collsub_min_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_4 *a;
+	  GFC_INTEGER_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_i4);
+
+void
+nca_collsub_sum_array_i4 (gfc_array_i4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_4 *a;
+	  GFC_INTEGER_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_i8.c b/libgfortran/generated/nca_minmax_i8.c
new file mode 100644
index 0000000..b3dc861
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_i8.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_INTEGER_8)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_i8);
+
+void
+nca_collsub_max_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_8 *a, *b;
+  GFC_INTEGER_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_i8);
+
+void
+nca_collsub_min_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_8 *a, *b;
+  GFC_INTEGER_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_i8);
+
+void
+nca_collsub_sum_scalar_i8 (GFC_INTEGER_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_INTEGER_8 *a, *b;
+  GFC_INTEGER_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_INTEGER_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_i8);
+
+void
+nca_collsub_max_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_8 *a;
+	  GFC_INTEGER_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_i8);
+
+void
+nca_collsub_min_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_8 *a;
+	  GFC_INTEGER_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_i8);
+
+void
+nca_collsub_sum_array_i8 (gfc_array_i8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_INTEGER_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_INTEGER_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_INTEGER_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_INTEGER_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_INTEGER_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_INTEGER_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_INTEGER_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_INTEGER_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_INTEGER_8 *a;
+	  GFC_INTEGER_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_INTEGER_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_INTEGER_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_r10.c b/libgfortran/generated/nca_minmax_r10.c
new file mode 100644
index 0000000..10f7324
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_r10.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_REAL_10)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_r10);
+
+void
+nca_collsub_max_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_10 *a, *b;
+  GFC_REAL_10 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_10) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_r10);
+
+void
+nca_collsub_min_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_10 *a, *b;
+  GFC_REAL_10 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_10) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_r10);
+
+void
+nca_collsub_sum_scalar_r10 (GFC_REAL_10 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_10 *a, *b;
+  GFC_REAL_10 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_10) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_r10);
+
+void
+nca_collsub_max_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_10 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_10 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_10);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_10);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_10);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_10 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_10 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_10 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_10 *a;
+	  GFC_REAL_10 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_10 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_10 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_r10);
+
+void
+nca_collsub_min_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_10 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_10 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_10);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_10);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_10);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_10 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_10 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_10 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_10 *a;
+	  GFC_REAL_10 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_10 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_10 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_r10);
+
+void
+nca_collsub_sum_array_r10 (gfc_array_r10 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_10 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_10 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_10);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_10);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_10);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_10 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_10 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_10 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_10 *a;
+	  GFC_REAL_10 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_10 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_10 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_r16.c b/libgfortran/generated/nca_minmax_r16.c
new file mode 100644
index 0000000..a0a0a51
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_r16.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_REAL_16)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_r16);
+
+void
+nca_collsub_max_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_16 *a, *b;
+  GFC_REAL_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_r16);
+
+void
+nca_collsub_min_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_16 *a, *b;
+  GFC_REAL_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_r16);
+
+void
+nca_collsub_sum_scalar_r16 (GFC_REAL_16 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_16 *a, *b;
+  GFC_REAL_16 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_16) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_r16);
+
+void
+nca_collsub_max_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_16 *a;
+	  GFC_REAL_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_r16);
+
+void
+nca_collsub_min_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_16 *a;
+	  GFC_REAL_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_r16);
+
+void
+nca_collsub_sum_array_r16 (gfc_array_r16 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_16 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_16 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_16);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_16);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_16);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_16 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_16 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_16 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_16 *a;
+	  GFC_REAL_16 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_16 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_16 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_r4.c b/libgfortran/generated/nca_minmax_r4.c
new file mode 100644
index 0000000..0eb3f1b
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_r4.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_REAL_4)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_r4);
+
+void
+nca_collsub_max_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_4 *a, *b;
+  GFC_REAL_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_r4);
+
+void
+nca_collsub_min_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_4 *a, *b;
+  GFC_REAL_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_r4);
+
+void
+nca_collsub_sum_scalar_r4 (GFC_REAL_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_4 *a, *b;
+  GFC_REAL_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_4) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_r4);
+
+void
+nca_collsub_max_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_4 *a;
+	  GFC_REAL_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_r4);
+
+void
+nca_collsub_min_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_4 *a;
+	  GFC_REAL_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_r4);
+
+void
+nca_collsub_sum_array_r4 (gfc_array_r4 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_4 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_4 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_4);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_4);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_4);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_4 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_4 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_4 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_4 *a;
+	  GFC_REAL_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_4 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_4 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_r8.c b/libgfortran/generated/nca_minmax_r8.c
new file mode 100644
index 0000000..3b3e962
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_r8.c
@@ -0,0 +1,653 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_REAL_8)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+void nca_collsub_max_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_r8);
+
+void
+nca_collsub_max_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_8 *a, *b;
+  GFC_REAL_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b > *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_r8);
+
+void
+nca_collsub_min_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_8 *a, *b;
+  GFC_REAL_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  if (*b < *a)
+	    *a = *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_sum_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			int *stat, char *errmsg, index_type errmsg_len);
+export_proto(nca_collsub_sum_scalar_r8);
+
+void
+nca_collsub_sum_scalar_r8 (GFC_REAL_8 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_REAL_8 *a, *b;
+  GFC_REAL_8 *buffer, *this_image_buf;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  buffer = get_collsub_buf (ci, sizeof(GFC_REAL_8) * local->num_images);
+  this_image_buf = buffer + this_image.image_num;
+  *this_image_buf = *obj;
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset;
+	  *a += *b;
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    *obj = *buffer;
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_max_array_r8);
+
+void
+nca_collsub_max_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_8 *a;
+	  GFC_REAL_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b > *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_min_array_r8);
+
+void
+nca_collsub_min_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_8 *a;
+	  GFC_REAL_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      if (*b < *a)
+		*a = *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_sum_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+				      int *stat, char *errmsg, index_type errmsg_len);
+export_proto (nca_collsub_sum_array_r8);
+
+void
+nca_collsub_sum_array_r8 (gfc_array_r8 * restrict array, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];
+  index_type extent[GFC_MAX_DIMENSIONS];
+  GFC_REAL_8 *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  GFC_REAL_8 *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  dim = GFC_DESCRIPTOR_RANK (array);
+  ssize = sizeof (GFC_REAL_8);
+  packed = true;
+  span = array->span != 0 ? array->span : (index_type) sizeof (GFC_REAL_8);
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (ssize != stride[n])
+	packed = false;
+
+      ssize *= extent[n];
+    }
+
+  num_elems = ssize / sizeof (GFC_REAL_8);
+
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * num_elems;
+  
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      GFC_REAL_8 *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+	  *(dest++) = *((GFC_REAL_8 *) src);
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0 && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  GFC_REAL_8 * other_shared_ptr;  /* Points to the shared memory
+						allocated to another image.  */
+	  GFC_REAL_8 *a;
+	  GFC_REAL_8 *b;
+
+	  other_shared_ptr = this_shared_ptr + num_elems * imoffset;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = this_shared_ptr + i;
+	      b = other_shared_ptr + i;
+	      *a += *b;
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  GFC_REAL_8 *src = buffer;
+	  char * restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      *((GFC_REAL_8 * ) dest) =  *src++;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_s1.c b/libgfortran/generated/nca_minmax_s1.c
new file mode 100644
index 0000000..b081452
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_s1.c
@@ -0,0 +1,494 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_UINTEGER_1)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+#if 1 == 4
+
+/* Compare wide character types, which are handled internally as
+   unsigned 4-byte integers.  */
+static inline int
+memcmp4 (const void *a, const void *b, size_t len)
+{
+  const GFC_UINTEGER_4 *pa = a;
+  const GFC_UINTEGER_4 *pb = b;
+  while (len-- > 0)
+    {
+      if (*pa != *pb)
+	return *pa < *pb ? -1 : 1;
+      pa ++;
+      pb ++;
+    }
+  return 0;
+}
+
+#endif
+void nca_collsub_max_scalar_s1 (GFC_UINTEGER_1 *obj, int *result_image,
+			int *stat, char *errmsg, index_type char_len, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_s1);
+
+void
+nca_collsub_max_scalar_s1 (GFC_UINTEGER_1 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type char_len,
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_UINTEGER_1 *a, *b;
+  GFC_UINTEGER_1 *buffer, *this_image_buf;
+  collsub_iface *ci;
+  index_type type_size;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_1);
+  buffer = get_collsub_buf (ci, type_size * local->num_images);
+  this_image_buf = buffer + this_image.image_num * char_len;
+  memcpy (this_image_buf, obj, type_size);
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset * char_len;
+	  if (memcmp (b, a, char_len) > 0)
+	    memcpy (a, b, type_size);
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    memcpy (obj, buffer, type_size);
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_s1 (GFC_UINTEGER_1 *obj, int *result_image,
+			int *stat, char *errmsg, index_type char_len, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_s1);
+
+void
+nca_collsub_min_scalar_s1 (GFC_UINTEGER_1 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type char_len,
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_UINTEGER_1 *a, *b;
+  GFC_UINTEGER_1 *buffer, *this_image_buf;
+  collsub_iface *ci;
+  index_type type_size;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_1);
+  buffer = get_collsub_buf (ci, type_size * local->num_images);
+  this_image_buf = buffer + this_image.image_num * char_len;
+  memcpy (this_image_buf, obj, type_size);
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset * char_len;
+	  if (memcmp (b, a, char_len) < 0)
+	    memcpy (a, b, type_size);
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    memcpy (obj, buffer, type_size);
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_s1 (gfc_array_s1 * restrict array, int *result_image,
+				int *stat, char *errmsg, index_type char_len,
+				index_type errmsg_len);
+export_proto (nca_collsub_max_array_s1);
+
+void
+nca_collsub_max_array_s1 (gfc_array_s1 * restrict array, int *result_image,
+			  int *stat __attribute__ ((unused)),
+			  char *errmsg __attribute__ ((unused)),
+			  index_type char_len,
+			  index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];  /* stride is byte-based here.  */
+  index_type extent[GFC_MAX_DIMENSIONS];
+  char *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  char *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  index_type type_size;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_1);
+  dim = GFC_DESCRIPTOR_RANK (array);
+  num_elems = 1;
+  ssize = type_size;
+  packed = true;
+  span = array->span != 0 ? array->span : type_size;
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (num_elems != GFC_DESCRIPTOR_STRIDE (array,n))
+	packed = false;
+
+      num_elems *= extent[n];
+    }
+
+  ssize = num_elems * type_size;
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * ssize;
+
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      char *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+
+	  memcpy (dest, src, type_size);
+	  dest += type_size;
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  char *other_shared_ptr;  /* Points to the shared memory
+				      allocated to another image.  */
+	  GFC_UINTEGER_1 *a;
+	  GFC_UINTEGER_1 *b;
+
+	  other_shared_ptr = this_shared_ptr + imoffset * ssize;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = (GFC_UINTEGER_1 *) (this_shared_ptr + i * type_size);
+	      b = (GFC_UINTEGER_1 *) (other_shared_ptr + i * type_size);
+	      if (memcmp (b, a, char_len) > 0)
+		memcpy (a, b, type_size);
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  char *src = buffer;
+	  char *restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      memcpy (dest, src, type_size);
+	      src += span;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_s1 (gfc_array_s1 * restrict array, int *result_image,
+				int *stat, char *errmsg, index_type char_len,
+				index_type errmsg_len);
+export_proto (nca_collsub_min_array_s1);
+
+void
+nca_collsub_min_array_s1 (gfc_array_s1 * restrict array, int *result_image,
+			  int *stat __attribute__ ((unused)),
+			  char *errmsg __attribute__ ((unused)),
+			  index_type char_len,
+			  index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];  /* stride is byte-based here.  */
+  index_type extent[GFC_MAX_DIMENSIONS];
+  char *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  char *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  index_type type_size;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_1);
+  dim = GFC_DESCRIPTOR_RANK (array);
+  num_elems = 1;
+  ssize = type_size;
+  packed = true;
+  span = array->span != 0 ? array->span : type_size;
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (num_elems != GFC_DESCRIPTOR_STRIDE (array,n))
+	packed = false;
+
+      num_elems *= extent[n];
+    }
+
+  ssize = num_elems * type_size;
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * ssize;
+
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      char *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+
+	  memcpy (dest, src, type_size);
+	  dest += type_size;
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  char *other_shared_ptr;  /* Points to the shared memory
+				      allocated to another image.  */
+	  GFC_UINTEGER_1 *a;
+	  GFC_UINTEGER_1 *b;
+
+	  other_shared_ptr = this_shared_ptr + imoffset * ssize;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = (GFC_UINTEGER_1 *) (this_shared_ptr + i * type_size);
+	      b = (GFC_UINTEGER_1 *) (other_shared_ptr + i * type_size);
+	      if (memcmp (b, a, char_len) < 0)
+		memcpy (a, b, type_size);
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  char *src = buffer;
+	  char *restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      memcpy (dest, src, type_size);
+	      src += span;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
diff --git a/libgfortran/generated/nca_minmax_s4.c b/libgfortran/generated/nca_minmax_s4.c
new file mode 100644
index 0000000..b202fda
--- /dev/null
+++ b/libgfortran/generated/nca_minmax_s4.c
@@ -0,0 +1,494 @@
+/* Implementation of collective subroutines minmax.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   Contributed by Thomas Koenig  <tkoenig@gcc.gnu.org>.
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+
+#include "libgfortran.h"
+
+#if defined (HAVE_GFC_UINTEGER_4)
+#include <string.h>
+#include "../nca/libcoarraynative.h"
+#include "../nca/collective_subroutine.h"
+#include "../nca/collective_inline.h"
+
+#if 4 == 4
+
+/* Compare wide character types, which are handled internally as
+   unsigned 4-byte integers.  */
+static inline int
+memcmp4 (const void *a, const void *b, size_t len)
+{
+  const GFC_UINTEGER_4 *pa = a;
+  const GFC_UINTEGER_4 *pb = b;
+  while (len-- > 0)
+    {
+      if (*pa != *pb)
+	return *pa < *pb ? -1 : 1;
+      pa ++;
+      pb ++;
+    }
+  return 0;
+}
+
+#endif
+void nca_collsub_max_scalar_s4 (GFC_UINTEGER_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type char_len, index_type errmsg_len);
+export_proto(nca_collsub_max_scalar_s4);
+
+void
+nca_collsub_max_scalar_s4 (GFC_UINTEGER_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type char_len,
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_UINTEGER_4 *a, *b;
+  GFC_UINTEGER_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+  index_type type_size;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_4);
+  buffer = get_collsub_buf (ci, type_size * local->num_images);
+  this_image_buf = buffer + this_image.image_num * char_len;
+  memcpy (this_image_buf, obj, type_size);
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset * char_len;
+	  if (memcmp4 (b, a, char_len) > 0)
+	    memcpy (a, b, type_size);
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    memcpy (obj, buffer, type_size);
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_min_scalar_s4 (GFC_UINTEGER_4 *obj, int *result_image,
+			int *stat, char *errmsg, index_type char_len, index_type errmsg_len);
+export_proto(nca_collsub_min_scalar_s4);
+
+void
+nca_collsub_min_scalar_s4 (GFC_UINTEGER_4 *obj, int *result_image,
+			   int *stat __attribute__ ((unused)),
+			   char *errmsg __attribute__ ((unused)),
+			   index_type char_len,
+			   index_type errmsg_len __attribute__ ((unused)))
+{
+  int cbit = 0;
+  int imoffset;
+  GFC_UINTEGER_4 *a, *b;
+  GFC_UINTEGER_4 *buffer, *this_image_buf;
+  collsub_iface *ci;
+  index_type type_size;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_4);
+  buffer = get_collsub_buf (ci, type_size * local->num_images);
+  this_image_buf = buffer + this_image.image_num * char_len;
+  memcpy (this_image_buf, obj, type_size);
+
+  collsub_sync (ci);
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  a = this_image_buf;
+	  b = this_image_buf + imoffset * char_len;
+	  if (memcmp4 (b, a, char_len) < 0)
+	    memcpy (a, b, type_size);
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    memcpy (obj, buffer, type_size);
+
+  finish_collective_subroutine (ci);
+
+}
+
+void nca_collsub_max_array_s4 (gfc_array_s4 * restrict array, int *result_image,
+				int *stat, char *errmsg, index_type char_len,
+				index_type errmsg_len);
+export_proto (nca_collsub_max_array_s4);
+
+void
+nca_collsub_max_array_s4 (gfc_array_s4 * restrict array, int *result_image,
+			  int *stat __attribute__ ((unused)),
+			  char *errmsg __attribute__ ((unused)),
+			  index_type char_len,
+			  index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];  /* stride is byte-based here.  */
+  index_type extent[GFC_MAX_DIMENSIONS];
+  char *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  char *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  index_type type_size;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_4);
+  dim = GFC_DESCRIPTOR_RANK (array);
+  num_elems = 1;
+  ssize = type_size;
+  packed = true;
+  span = array->span != 0 ? array->span : type_size;
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (num_elems != GFC_DESCRIPTOR_STRIDE (array,n))
+	packed = false;
+
+      num_elems *= extent[n];
+    }
+
+  ssize = num_elems * type_size;
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * ssize;
+
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      char *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+
+	  memcpy (dest, src, type_size);
+	  dest += type_size;
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  char *other_shared_ptr;  /* Points to the shared memory
+				      allocated to another image.  */
+	  GFC_UINTEGER_4 *a;
+	  GFC_UINTEGER_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + imoffset * ssize;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = (GFC_UINTEGER_4 *) (this_shared_ptr + i * type_size);
+	      b = (GFC_UINTEGER_4 *) (other_shared_ptr + i * type_size);
+	      if (memcmp4 (b, a, char_len) > 0)
+		memcpy (a, b, type_size);
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  char *src = buffer;
+	  char *restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      memcpy (dest, src, type_size);
+	      src += span;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+void nca_collsub_min_array_s4 (gfc_array_s4 * restrict array, int *result_image,
+				int *stat, char *errmsg, index_type char_len,
+				index_type errmsg_len);
+export_proto (nca_collsub_min_array_s4);
+
+void
+nca_collsub_min_array_s4 (gfc_array_s4 * restrict array, int *result_image,
+			  int *stat __attribute__ ((unused)),
+			  char *errmsg __attribute__ ((unused)),
+			  index_type char_len,
+			  index_type errmsg_len __attribute__ ((unused)))
+{
+  index_type count[GFC_MAX_DIMENSIONS];
+  index_type stride[GFC_MAX_DIMENSIONS];  /* stride is byte-based here.  */
+  index_type extent[GFC_MAX_DIMENSIONS];
+  char *this_shared_ptr;  /* Points to the shared memory allocated to this image.  */
+  char *buffer;
+  index_type dim;
+  bool packed;
+  index_type span;
+  index_type ssize, num_elems;
+  int cbit = 0;
+  int imoffset;
+  index_type type_size;
+  collsub_iface *ci;
+
+  ci = &local->ci;
+
+  type_size = char_len * sizeof (GFC_UINTEGER_4);
+  dim = GFC_DESCRIPTOR_RANK (array);
+  num_elems = 1;
+  ssize = type_size;
+  packed = true;
+  span = array->span != 0 ? array->span : type_size;
+  for (index_type n = 0; n < dim; n++)
+    {
+      count[n] = 0;
+      stride[n] = GFC_DESCRIPTOR_STRIDE (array, n) * span;
+      extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);
+
+      /* No-op for an empty array.  */
+      if (extent[n] <= 0)
+	return;
+
+      if (num_elems != GFC_DESCRIPTOR_STRIDE (array,n))
+	packed = false;
+
+      num_elems *= extent[n];
+    }
+
+  ssize = num_elems * type_size;
+  buffer = get_collsub_buf (ci, ssize * local->num_images);
+  this_shared_ptr = buffer + this_image.image_num * ssize;
+
+  if (packed)
+    memcpy (this_shared_ptr, array->base_addr, ssize);
+  else
+    {
+      char *src = (char *) array->base_addr;
+      char *restrict dest = this_shared_ptr;
+      index_type stride0 = stride[0];
+
+      while (src)
+	{
+	  /* Copy the data.  */
+
+	  memcpy (dest, src, type_size);
+	  dest += type_size;
+	  src += stride0;
+	  count[0] ++;
+	  /* Advance to the next source element.  */
+	  for (index_type n = 0; count[n] == extent[n] ; )
+	    {
+	      /* When we get to the end of a dimension, reset it and increment
+		 the next dimension.  */
+	      count[n] = 0;
+	      src -= stride[n] * extent[n];
+	      n++;
+	      if (n == dim)
+		{
+		  src = NULL;
+		  break;
+		}
+	      else
+		{
+		  count[n]++;
+		  src += stride[n];
+		}
+	    }
+	}
+    }
+
+  collsub_sync (ci);
+
+  /* Reduce the array to image zero. Here the general scheme:
+
+      abababababab
+      a_b_a_b_a_b_
+      a___b___a___
+      a_______b___
+      r___________
+  */
+  for (; ((this_image.image_num >> cbit) & 1) == 0
+    && (local->num_images >> cbit) != 0; cbit++)
+    {
+      imoffset = 1 << cbit;
+      if (this_image.image_num + imoffset < local->num_images)
+	{
+	  char *other_shared_ptr;  /* Points to the shared memory
+				      allocated to another image.  */
+	  GFC_UINTEGER_4 *a;
+	  GFC_UINTEGER_4 *b;
+
+	  other_shared_ptr = this_shared_ptr + imoffset * ssize;
+	  for (index_type i = 0; i < num_elems; i++)
+	    {
+	      a = (GFC_UINTEGER_4 *) (this_shared_ptr + i * type_size);
+	      b = (GFC_UINTEGER_4 *) (other_shared_ptr + i * type_size);
+	      if (memcmp4 (b, a, char_len) < 0)
+		memcpy (a, b, type_size);
+	    }
+	}
+      collsub_sync (ci);
+    }
+  for ( ; (local->num_images >> cbit) != 0; cbit++)
+    collsub_sync (ci);
+
+  if (!result_image || (*result_image - 1) == this_image.image_num)
+    {
+      if (packed)
+	memcpy (array->base_addr, buffer, ssize);
+      else
+	{
+	  char *src = buffer;
+	  char *restrict dest = (char *) array->base_addr;
+	  index_type stride0 = stride[0];
+
+	  for (index_type n = 0; n < dim; n++)
+	    count[n] = 0;
+
+	  while (dest)
+	    {
+	      memcpy (dest, src, type_size);
+	      src += span;
+	      dest += stride0;
+	      count[0] ++;
+	      for (index_type n = 0; count[n] == extent[n] ;)
+	        {
+	      	  /* When we get to the end of a dimension, reset it and increment
+		     the next dimension.  */
+	      	   count[n] = 0;
+	      	   dest -= stride[n] * extent[n];
+	      	   n++;
+	      	   if (n == dim)
+		     {
+		       dest = NULL;
+		       break;
+		     }
+	      	   else
+		     {
+		       count[n]++;
+		       dest += stride[n];
+		     }
+		}
+	    }
+	}
+    }
+    finish_collective_subroutine (ci);
+}
+
+#endif
+
author	I'm not telling you my name, idiot <git_is@stupid.com>	2020-09-23 18:14:20 +0200
committer	I'm not telling you my name, idiot <git_is@stupid.com>	2020-09-23 18:14:20 +0200
commit	bef0a39f2f8e87780f990d12fa71f5ed0039267a (patch)
tree	ce8138e924d9bfff3ce0b63aa4c29397fbfaa1a5 /libgfortran/generated
parent	9044db88d634c631920eaa9f66c0275adf18fdf5 (diff)
download	gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.zip gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.tar.gz gcc-bef0a39f2f8e87780f990d12fa71f5ed0039267a.tar.bz2