path: root/libffi/src/aarch64/ffi.c
Diffstat (limited to 'libffi/src/aarch64/ffi.c')
-rw-r--r--  libffi/src/aarch64/ffi.c | 1442
1 file changed, 638 insertions(+), 804 deletions(-)
diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index 1405665..0cace9d 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -20,19 +20,22 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#include <stdio.h>
-
+#include <stdlib.h>
+#include <stdint.h>
#include <ffi.h>
#include <ffi_common.h>
-
-#include <stdlib.h>
-
-/* Stack alignment requirement in bytes */
-#define AARCH64_STACK_ALIGN 16
-
-#define N_X_ARG_REG 8
-#define N_V_ARG_REG 8
-
-#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)
+#include "internal.h"
+
+/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
+ all further uses in this file will refer to the 128-bit type. */
+#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+# if FFI_TYPE_LONGDOUBLE != 4
+# error FFI_TYPE_LONGDOUBLE out of date
+# endif
+#else
+# undef FFI_TYPE_LONGDOUBLE
+# define FFI_TYPE_LONGDOUBLE 4
+#endif
union _d
{
@@ -40,321 +43,184 @@ union _d
UINT32 s[2];
};
-struct call_context
+struct _v
{
- UINT64 x [AARCH64_N_XREG];
- struct
- {
- union _d d[2];
- } v [AARCH64_N_VREG];
+ union _d d[2] __attribute__((aligned(16)));
};
-static void *
-get_x_addr (struct call_context *context, unsigned n)
+struct call_context
{
- return &context->x[n];
-}
+ struct _v v[N_V_ARG_REG];
+ UINT64 x[N_X_ARG_REG];
+};
-static void *
-get_s_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
- return &context->v[n].d[1].s[1];
-#else
- return &context->v[n].d[0].s[0];
+#if defined (__clang__) && defined (__APPLE__)
+extern void sys_icache_invalidate (void *start, size_t len);
#endif
-}
-static void *
-get_d_addr (struct call_context *context, unsigned n)
+static inline void
+ffi_clear_cache (void *start, void *end)
{
-#if defined __AARCH64EB__
- return &context->v[n].d[1];
+#if defined (__clang__) && defined (__APPLE__)
+ sys_icache_invalidate (start, (char *)end - (char *)start);
+#elif defined (__GNUC__)
+ __builtin___clear_cache (start, end);
#else
- return &context->v[n].d[0];
+#error "Missing builtin to flush instruction cache"
#endif
}
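
This helper is why the trampolines below work at all: AArch64 has separate instruction and data caches, so code written through a data pointer is not visible to instruction fetch until the two are synchronized. A minimal usage sketch, assuming `buf` was already mapped executable (the names here are illustrative, not part of this file):

    /* Publish freshly written instructions before branching to them.  */
    static void
    publish_code (void *buf, size_t len)
    {
      /* ... instruction words have just been stored into buf ... */
      ffi_clear_cache (buf, (char *) buf + len);
      /* Only after the flush is it safe to jump into buf.  */
    }
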
-static void *
-get_v_addr (struct call_context *context, unsigned n)
-{
- return &context->v[n];
-}
-
-/* Return the memory location at which a basic type would reside
- were it to have been stored in register n. */
-
-static void *
-get_basic_type_addr (unsigned short type, struct call_context *context,
- unsigned n)
-{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
- return get_s_addr (context, n);
- case FFI_TYPE_DOUBLE:
- return get_d_addr (context, n);
- case FFI_TYPE_LONGDOUBLE:
- return get_v_addr (context, n);
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- return get_x_addr (context, n);
- default:
- FFI_ASSERT (0);
- return NULL;
- }
-}
-
-/* Return the alignment width for each of the basic types. */
-
-static size_t
-get_basic_type_alignment (unsigned short type)
-{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- return sizeof (UINT64);
- case FFI_TYPE_LONGDOUBLE:
- return sizeof (long double);
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- return sizeof (UINT64);
-
- default:
- FFI_ASSERT (0);
- return 0;
- }
-}
-
-/* Return the size in bytes for each of the basic types. */
+/* A subroutine of is_vfp_type. Given a structure type, return the type code
+ of the first non-structure element. Recurse for structure elements.
+ Return -1 if the structure is in fact empty, i.e. no nested elements. */
-static size_t
-get_basic_type_size (unsigned short type)
+static int
+is_hfa0 (const ffi_type *ty)
{
- switch (type)
- {
- case FFI_TYPE_FLOAT:
- return sizeof (UINT32);
- case FFI_TYPE_DOUBLE:
- return sizeof (UINT64);
- case FFI_TYPE_LONGDOUBLE:
- return sizeof (long double);
- case FFI_TYPE_UINT8:
- return sizeof (UINT8);
- case FFI_TYPE_SINT8:
- return sizeof (SINT8);
- case FFI_TYPE_UINT16:
- return sizeof (UINT16);
- case FFI_TYPE_SINT16:
- return sizeof (SINT16);
- case FFI_TYPE_UINT32:
- return sizeof (UINT32);
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT32:
- return sizeof (SINT32);
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- return sizeof (UINT64);
- case FFI_TYPE_SINT64:
- return sizeof (SINT64);
-
- default:
- FFI_ASSERT (0);
- return 0;
- }
-}
-
-extern void
-ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
- extended_cif *),
- struct call_context *context,
- extended_cif *,
- unsigned,
- void (*fn)(void));
+ ffi_type **elements = ty->elements;
+ int i, ret = -1;
-extern void
-ffi_closure_SYSV (ffi_closure *);
-
-/* Test for an FFI floating point representation. */
+ if (elements != NULL)
+ for (i = 0; elements[i]; ++i)
+ {
+ ret = elements[i]->type;
+ if (ret == FFI_TYPE_STRUCT || ret == FFI_TYPE_COMPLEX)
+ {
+ ret = is_hfa0 (elements[i]);
+ if (ret < 0)
+ continue;
+ }
+ break;
+ }
-static unsigned
-is_floating_type (unsigned short type)
-{
- return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE
- || type == FFI_TYPE_LONGDOUBLE);
+ return ret;
}
-/* Test for a homogeneous structure. */
+/* A subroutine of is_vfp_type. Given a structure type, return true if all
+ of the non-structure elements are the same as CANDIDATE. */
-static unsigned short
-get_homogeneous_type (ffi_type *ty)
+static int
+is_hfa1 (const ffi_type *ty, int candidate)
{
- if (ty->type == FFI_TYPE_STRUCT && ty->elements)
- {
- unsigned i;
- unsigned short candidate_type
- = get_homogeneous_type (ty->elements[0]);
- for (i =1; ty->elements[i]; i++)
- {
- unsigned short iteration_type = 0;
- /* If we have a nested struct, we must find its homogeneous type.
- If that fits with our candidate type, we are still
- homogeneous. */
- if (ty->elements[i]->type == FFI_TYPE_STRUCT
- && ty->elements[i]->elements)
- {
- iteration_type = get_homogeneous_type (ty->elements[i]);
- }
- else
- {
- iteration_type = ty->elements[i]->type;
- }
+ ffi_type **elements = ty->elements;
+ int i;
- /* If we are not homogeneous, return FFI_TYPE_STRUCT. */
- if (candidate_type != iteration_type)
- return FFI_TYPE_STRUCT;
- }
- return candidate_type;
- }
+ if (elements != NULL)
+ for (i = 0; elements[i]; ++i)
+ {
+ int t = elements[i]->type;
+ if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
+ {
+ if (!is_hfa1 (elements[i], candidate))
+ return 0;
+ }
+ else if (t != candidate)
+ return 0;
+ }
- /* Base case, we have no more levels of nesting, so we
- are a basic type, and so, trivially homogeneous in that type. */
- return ty->type;
+ return 1;
}
-/* Determine the number of elements within a STRUCT.
-
- Note, we must handle nested structs.
+/* Determine if TY may be allocated to the FP registers.  This is either
+   an fp scalar type or a homogeneous floating point aggregate (HFA),
+   that is, a structure consisting of 1 to 4 members, all of the same
+   fp scalar type.
- If ty is not a STRUCT this function will return 0. */
+ Returns non-zero iff TY is an HFA. The result is the AARCH64_RET_*
+ constant for the type. */
-static unsigned
-element_count (ffi_type *ty)
+static int
+is_vfp_type (const ffi_type *ty)
{
- if (ty->type == FFI_TYPE_STRUCT && ty->elements)
+ ffi_type **elements;
+ int candidate, i;
+ size_t size, ele_count;
+
+ /* Quickest tests first. */
+ candidate = ty->type;
+ switch (candidate)
{
- unsigned n;
- unsigned elems = 0;
- for (n = 0; ty->elements[n]; n++)
+ default:
+ return 0;
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+ ele_count = 1;
+ goto done;
+ case FFI_TYPE_COMPLEX:
+ candidate = ty->elements[0]->type;
+ switch (candidate)
{
- if (ty->elements[n]->type == FFI_TYPE_STRUCT
- && ty->elements[n]->elements)
- elems += element_count (ty->elements[n]);
- else
- elems++;
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+ ele_count = 2;
+ goto done;
}
- return elems;
+ return 0;
+ case FFI_TYPE_STRUCT:
+ break;
}
- return 0;
-}
-
-/* Test for a homogeneous floating point aggregate.
- A homogeneous floating point aggregate is a homogeneous aggregate of
- a half- single- or double- precision floating point type with one
- to four elements. Note that this includes nested structs of the
- basic type. */
+ /* No HFA types are smaller than 4 bytes, or larger than 64 bytes. */
+ size = ty->size;
+ if (size < 4 || size > 64)
+ return 0;
-static int
-is_hfa (ffi_type *ty)
-{
- if (ty->type == FFI_TYPE_STRUCT
- && ty->elements[0]
- && is_floating_type (get_homogeneous_type (ty)))
+ /* Find the type of the first non-structure member. */
+ elements = ty->elements;
+ candidate = elements[0]->type;
+ if (candidate == FFI_TYPE_STRUCT || candidate == FFI_TYPE_COMPLEX)
{
- unsigned n = element_count (ty);
- return n >= 1 && n <= 4;
+ for (i = 0; ; ++i)
+ {
+ candidate = is_hfa0 (elements[i]);
+ if (candidate >= 0)
+ break;
+ }
}
- return 0;
-}
-
-/* Test if an ffi_type is a candidate for passing in a register.
-
- This test does not check that sufficient registers of the
- appropriate class are actually available, merely that IFF
- sufficient registers are available then the argument will be passed
- in register(s).
- Note that an ffi_type that is deemed to be a register candidate
- will always be returned in registers.
-
- Returns 1 if a register candidate else 0. */
-
-static int
-is_register_candidate (ffi_type *ty)
-{
- switch (ty->type)
+ /* If the first member is not a floating point type, it's not an HFA.
+ Also quickly re-check the size of the structure. */
+ switch (candidate)
{
- case FFI_TYPE_VOID:
case FFI_TYPE_FLOAT:
+ ele_count = size / sizeof(float);
+ if (size != ele_count * sizeof(float))
+ return 0;
+ break;
case FFI_TYPE_DOUBLE:
+ ele_count = size / sizeof(double);
+ if (size != ele_count * sizeof(double))
+ return 0;
+ break;
case FFI_TYPE_LONGDOUBLE:
- case FFI_TYPE_UINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT64:
- return 1;
-
- case FFI_TYPE_STRUCT:
- if (is_hfa (ty))
- {
- return 1;
- }
- else if (ty->size > 16)
- {
- /* Too large. Will be replaced with a pointer to memory. The
- pointer MAY be passed in a register, but the value will
- not. This test specifically fails since the argument will
- never be passed by value in registers. */
- return 0;
- }
- else
- {
- /* Might be passed in registers depending on the number of
- registers required. */
- return (ty->size + 7) / 8 < N_X_ARG_REG;
- }
+ ele_count = size / sizeof(long double);
+ if (size != ele_count * sizeof(long double))
+ return 0;
break;
-
default:
- FFI_ASSERT (0);
- break;
+ return 0;
}
+ if (ele_count > 4)
+ return 0;
- return 0;
-}
-
-/* Test if an ffi_type argument or result is a candidate for a vector
- register. */
+ /* Finally, make sure that all scalar elements are the same type. */
+ for (i = 0; elements[i]; ++i)
+ {
+ int t = elements[i]->type;
+ if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
+ {
+ if (!is_hfa1 (elements[i], candidate))
+ return 0;
+ }
+ else if (t != candidate)
+ return 0;
+ }
-static int
-is_v_register_candidate (ffi_type *ty)
-{
- return is_floating_type (ty->type)
- || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
+ /* All tests succeeded. Encode the result. */
+ done:
+ return candidate * 4 + (4 - ele_count);
}
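
The encoding on the last line packs both the scalar kind and the element count into one integer: candidate * 4 + (4 - ele_count). A worked example, assuming the usual type codes (FFI_TYPE_FLOAT == 2) and the AARCH64_RET_S4..Q1 layout from internal.h:

    ffi_type *elts[] = { &ffi_type_float, &ffi_type_float, NULL };
    ffi_type pair = { 0, 0, FFI_TYPE_STRUCT, elts };
    /* Once ffi_prep_cif has laid it out (size == 8): candidate is
       FFI_TYPE_FLOAT, ele_count == 8 / sizeof (float) == 2, so
       is_vfp_type returns 2*4 + (4 - 2) == 10 == AARCH64_RET_S2.
       A mixed struct { float f; int i; } fails is_hfa1 and yields 0.  */
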
/* Representation of the procedure call argument marshalling
@@ -367,161 +233,198 @@ struct arg_state
{
unsigned ngrn; /* Next general-purpose register number. */
unsigned nsrn; /* Next vector register number. */
- unsigned nsaa; /* Next stack offset. */
+ size_t nsaa; /* Next stack offset. */
+
+#if defined (__APPLE__)
+ unsigned allocating_variadic;
+#endif
};
/* Initialize a procedure call argument marshalling state. */
static void
-arg_init (struct arg_state *state, unsigned call_frame_size)
+arg_init (struct arg_state *state)
{
state->ngrn = 0;
state->nsrn = 0;
state->nsaa = 0;
-}
-
-/* Return the number of available consecutive core argument
- registers. */
-
-static unsigned
-available_x (struct arg_state *state)
-{
- return N_X_ARG_REG - state->ngrn;
-}
-
-/* Return the number of available consecutive vector argument
- registers. */
-
-static unsigned
-available_v (struct arg_state *state)
-{
- return N_V_ARG_REG - state->nsrn;
-}
-
-static void *
-allocate_to_x (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->ngrn < N_X_ARG_REG)
- return get_x_addr (context, (state->ngrn)++);
-}
-
-static void *
-allocate_to_s (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG)
- return get_s_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_d (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG)
- return get_d_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_v (struct call_context *context, struct arg_state *state)
-{
- FFI_ASSERT (state->nsrn < N_V_ARG_REG)
- return get_v_addr (context, (state->nsrn)++);
+#if defined (__APPLE__)
+ state->allocating_variadic = 0;
+#endif
}
/* Allocate an aligned slot on the stack and return a pointer to it. */
static void *
-allocate_to_stack (struct arg_state *state, void *stack, unsigned alignment,
- unsigned size)
+allocate_to_stack (struct arg_state *state, void *stack,
+ size_t alignment, size_t size)
{
- void *allocation;
+ size_t nsaa = state->nsaa;
/* Round up the NSAA to the larger of 8 or the natural
alignment of the argument's type. */
- state->nsaa = ALIGN (state->nsaa, alignment);
- state->nsaa = ALIGN (state->nsaa, alignment);
- state->nsaa = ALIGN (state->nsaa, 8);
-
- allocation = stack + state->nsaa;
+#if defined (__APPLE__)
+ if (state->allocating_variadic && alignment < 8)
+ alignment = 8;
+#else
+ if (alignment < 8)
+ alignment = 8;
+#endif
+
+ nsaa = ALIGN (nsaa, alignment);
+ state->nsaa = nsaa + size;
- state->nsaa += size;
- return allocation;
+ return (char *)stack + nsaa;
}
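
A contrived example of the NSAA rounding, assuming the integer registers are already exhausted and five stack bytes are in use:

    char stack[64];
    struct arg_state st = { N_X_ARG_REG, 0, 5 };
    short v = 7;
    void *slot = allocate_to_stack (&st, stack, 2, sizeof v);
    /* Standard AAPCS64: alignment is raised to 8, so slot == stack + 8
       and st.nsaa == 10.  Apple, fixed argument: natural alignment is
       kept, so slot == stack + 6 and st.nsaa == 8.  */
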
-static void
-copy_basic_type (void *dest, void *source, unsigned short type)
+static ffi_arg
+extend_integer_type (void *source, int type)
{
- /* This is neccessary to ensure that basic types are copied
- sign extended to 64-bits as libffi expects. */
switch (type)
{
- case FFI_TYPE_FLOAT:
- *(float *) dest = *(float *) source;
- break;
- case FFI_TYPE_DOUBLE:
- *(double *) dest = *(double *) source;
- break;
- case FFI_TYPE_LONGDOUBLE:
- *(long double *) dest = *(long double *) source;
- break;
case FFI_TYPE_UINT8:
- *(ffi_arg *) dest = *(UINT8 *) source;
- break;
+ return *(UINT8 *) source;
case FFI_TYPE_SINT8:
- *(ffi_sarg *) dest = *(SINT8 *) source;
- break;
+ return *(SINT8 *) source;
case FFI_TYPE_UINT16:
- *(ffi_arg *) dest = *(UINT16 *) source;
- break;
+ return *(UINT16 *) source;
case FFI_TYPE_SINT16:
- *(ffi_sarg *) dest = *(SINT16 *) source;
- break;
+ return *(SINT16 *) source;
case FFI_TYPE_UINT32:
- *(ffi_arg *) dest = *(UINT32 *) source;
- break;
+ return *(UINT32 *) source;
case FFI_TYPE_INT:
case FFI_TYPE_SINT32:
- *(ffi_sarg *) dest = *(SINT32 *) source;
- break;
- case FFI_TYPE_POINTER:
+ return *(SINT32 *) source;
case FFI_TYPE_UINT64:
- *(ffi_arg *) dest = *(UINT64 *) source;
- break;
case FFI_TYPE_SINT64:
- *(ffi_sarg *) dest = *(SINT64 *) source;
+ return *(UINT64 *) source;
break;
-
+ case FFI_TYPE_POINTER:
+ return *(uintptr_t *) source;
default:
- FFI_ASSERT (0);
+ abort();
}
}
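
Every integer argument is widened here to a full 64-bit ffi_arg, with signedness taken from the FFI type code rather than from the caller's C declaration. For example:

    SINT8 s = -1;
    UINT8 u = 0xff;
    ffi_arg a = extend_integer_type (&s, FFI_TYPE_SINT8);
    ffi_arg b = extend_integer_type (&u, FFI_TYPE_UINT8);
    /* a == 0xffffffffffffffff (sign-extended),
       b == 0x00000000000000ff (zero-extended).  */
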
static void
-copy_hfa_to_reg_or_stack (void *memory,
- ffi_type *ty,
- struct call_context *context,
- unsigned char *stack,
- struct arg_state *state)
+extend_hfa_type (void *dest, void *src, int h)
+{
+ int f = h - AARCH64_RET_S4;
+ void *x0;
+
+ asm volatile (
+ "adr %0, 0f\n"
+" add %0, %0, %1\n"
+" br %0\n"
+"0: ldp s16, s17, [%3]\n" /* S4 */
+" ldp s18, s19, [%3, #8]\n"
+" b 4f\n"
+" ldp s16, s17, [%3]\n" /* S3 */
+" ldr s18, [%3, #8]\n"
+" b 3f\n"
+" ldp s16, s17, [%3]\n" /* S2 */
+" b 2f\n"
+" nop\n"
+" ldr s16, [%3]\n" /* S1 */
+" b 1f\n"
+" nop\n"
+" ldp d16, d17, [%3]\n" /* D4 */
+" ldp d18, d19, [%3, #16]\n"
+" b 4f\n"
+" ldp d16, d17, [%3]\n" /* D3 */
+" ldr d18, [%3, #16]\n"
+" b 3f\n"
+" ldp d16, d17, [%3]\n" /* D2 */
+" b 2f\n"
+" nop\n"
+" ldr d16, [%3]\n" /* D1 */
+" b 1f\n"
+" nop\n"
+" ldp q16, q17, [%3]\n" /* Q4 */
+" ldp q18, q19, [%3, #16]\n"
+" b 4f\n"
+" ldp q16, q17, [%3]\n" /* Q3 */
+" ldr q18, [%3, #16]\n"
+" b 3f\n"
+" ldp q16, q17, [%3]\n" /* Q2 */
+" b 2f\n"
+" nop\n"
+" ldr q16, [%3]\n" /* Q1 */
+" b 1f\n"
+"4: str q19, [%2, #48]\n"
+"3: str q18, [%2, #32]\n"
+"2: str q17, [%2, #16]\n"
+"1: str q16, [%2]"
+ : "=&r"(x0)
+ : "r"(f * 12), "r"(dest), "r"(src)
+ : "memory", "v16", "v17", "v18", "v19");
+}
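
The asm above is a computed branch into a table of 12-byte entries (three 4-byte instructions each), indexed by f = h - AARCH64_RET_S4; each entry loads the 1-4 scalar elements into v16-v19, and the shared tail stores them out as whole q registers at a 16-byte stride. Ignoring big-endian lane placement, a plain-C reference of the effect might read as follows (the _ref suffix marks it as an illustration, not part of the build):

    static void
    extend_hfa_type_ref (void *dest, void *src, int h)
    {
      size_t n = 4 - (h & 3);                       /* element count   */
      size_t w = 4 << ((h - AARCH64_RET_S4) / 4);   /* S:4, D:8, Q:16  */
      size_t i;
      /* Spread each element into its own 16-byte register image.  */
      for (i = 0; i < n; ++i)
        memcpy ((char *) dest + i * 16, (char *) src + i * w, w);
    }
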
+
+static void *
+compress_hfa_type (void *dest, void *reg, int h)
{
- unsigned elems = element_count (ty);
- if (available_v (state) < elems)
- {
- /* There are insufficient V registers. Further V register allocations
- are prevented, the NSAA is adjusted (by allocate_to_stack ())
- and the argument is copied to memory at the adjusted NSAA. */
- state->nsrn = N_V_ARG_REG;
- memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
- memory,
- ty->size);
- }
- else
+ switch (h)
{
- int i;
- unsigned short type = get_homogeneous_type (ty);
- unsigned elems = element_count (ty);
- for (i = 0; i < elems; i++)
+ case AARCH64_RET_S1:
+ if (dest == reg)
+ {
+#ifdef __AARCH64EB__
+ dest += 12;
+#endif
+ }
+ else
+ *(float *)dest = *(float *)reg;
+ break;
+ case AARCH64_RET_S2:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "st2 { v16.s, v17.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+ break;
+ case AARCH64_RET_S3:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "ldr q18, [%1, #32]\n\t"
+ "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+ break;
+ case AARCH64_RET_S4:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "ldp q18, q19, [%1, #32]\n\t"
+ "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+ break;
+
+ case AARCH64_RET_D1:
+ if (dest == reg)
{
- void *reg = allocate_to_v (context, state);
- copy_basic_type (reg, memory, type);
- memory += get_basic_type_size (type);
+#ifdef __AARCH64EB__
+ dest += 8;
+#endif
}
+ else
+ *(double *)dest = *(double *)reg;
+ break;
+ case AARCH64_RET_D2:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "st2 { v16.d, v17.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+ break;
+ case AARCH64_RET_D3:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "ldr q18, [%1, #32]\n\t"
+ "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+ break;
+ case AARCH64_RET_D4:
+ asm ("ldp q16, q17, [%1]\n\t"
+ "ldp q18, q19, [%1, #32]\n\t"
+ "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+ : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+ break;
+
+ default:
+ if (dest != reg)
+ return memcpy (dest, reg, 16 * (4 - (h & 3)));
+ break;
}
+ return dest;
}
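
compress_hfa_type performs the inverse: the st2/st3/st4 forms store lane 0 of each loaded q register back-to-back, gathering the scattered elements into contiguous memory, and dest == reg is tolerated (for single elements on big-endian the pointer is simply advanced instead). With the same n and w as in the sketch above, a little-endian reference version:

    static void *
    compress_hfa_type_ref (void *dest, void *reg, int h)
    {
      size_t n = 4 - (h & 3);
      size_t w = 4 << ((h - AARCH64_RET_S4) / 4);
      size_t i;
      for (i = 0; i < n; ++i)      /* memmove: dest may alias reg  */
        memmove ((char *) dest + i * w, (char *) reg + i * 16, w);
      return dest;
    }
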
/* Either allocate an appropriate register for the argument type, or if
@@ -529,83 +432,164 @@ copy_hfa_to_reg_or_stack (void *memory,
to the allocated space. */
static void *
-allocate_to_register_or_stack (struct call_context *context,
- unsigned char *stack,
- struct arg_state *state,
- unsigned short type)
+allocate_int_to_reg_or_stack (struct call_context *context,
+ struct arg_state *state,
+ void *stack, size_t size)
{
- size_t alignment = get_basic_type_alignment (type);
- size_t size = alignment;
- switch (type)
+ if (state->ngrn < N_X_ARG_REG)
+ return &context->x[state->ngrn++];
+
+ state->ngrn = N_X_ARG_REG;
+ return allocate_to_stack (state, stack, size, size);
+}
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+ ffi_type *rtype = cif->rtype;
+ size_t bytes = cif->bytes;
+ int flags, i, n;
+
+ switch (rtype->type)
{
- case FFI_TYPE_FLOAT:
- /* This is the only case for which the allocated stack size
- should not match the alignment of the type. */
- size = sizeof (UINT32);
- /* Fall through. */
- case FFI_TYPE_DOUBLE:
- if (state->nsrn < N_V_ARG_REG)
- return allocate_to_d (context, state);
- state->nsrn = N_V_ARG_REG;
- break;
- case FFI_TYPE_LONGDOUBLE:
- if (state->nsrn < N_V_ARG_REG)
- return allocate_to_v (context, state);
- state->nsrn = N_V_ARG_REG;
+ case FFI_TYPE_VOID:
+ flags = AARCH64_RET_VOID;
break;
case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
+ flags = AARCH64_RET_UINT8;
+ break;
case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
+ flags = AARCH64_RET_UINT16;
+ break;
case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
+ flags = AARCH64_RET_UINT32;
+ break;
+ case FFI_TYPE_SINT8:
+ flags = AARCH64_RET_SINT8;
+ break;
+ case FFI_TYPE_SINT16:
+ flags = AARCH64_RET_SINT16;
+ break;
case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
+ case FFI_TYPE_SINT32:
+ flags = AARCH64_RET_SINT32;
+ break;
case FFI_TYPE_SINT64:
- if (state->ngrn < N_X_ARG_REG)
- return allocate_to_x (context, state);
- state->ngrn = N_X_ARG_REG;
+ case FFI_TYPE_UINT64:
+ flags = AARCH64_RET_INT64;
break;
+ case FFI_TYPE_POINTER:
+ flags = (sizeof(void *) == 4 ? AARCH64_RET_UINT32 : AARCH64_RET_INT64);
+ break;
+
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
+ case FFI_TYPE_STRUCT:
+ case FFI_TYPE_COMPLEX:
+ flags = is_vfp_type (rtype);
+ if (flags == 0)
+ {
+ size_t s = rtype->size;
+ if (s > 16)
+ {
+ flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+ bytes += 8;
+ }
+ else if (s == 16)
+ flags = AARCH64_RET_INT128;
+ else if (s == 8)
+ flags = AARCH64_RET_INT64;
+ else
+ flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+ }
+ break;
+
default:
- FFI_ASSERT (0);
+ abort();
}
- return allocate_to_stack (state, stack, alignment, size);
-}
+ for (i = 0, n = cif->nargs; i < n; i++)
+ if (is_vfp_type (cif->arg_types[i]))
+ {
+ flags |= AARCH64_FLAG_ARG_V;
+ break;
+ }
-/* Copy a value to an appropriate register, or if none are
- available, to the stack. */
+ /* Round the stack up to a multiple of the stack alignment requirement. */
+ cif->bytes = ALIGN(bytes, 16);
+ cif->flags = flags;
+#if defined (__APPLE__)
+ cif->aarch64_nfixedargs = 0;
+#endif
-static void
-copy_to_register_or_stack (struct call_context *context,
- unsigned char *stack,
- struct arg_state *state,
- void *value,
- unsigned short type)
+ return FFI_OK;
+}
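
A few example flag computations under this scheme (constant values as laid out in internal.h, so treat the exact numbers as illustrative):

    /* double f(double):  is_vfp_type -> 3*4 + (4-1) == AARCH64_RET_D1,
         plus AARCH64_FLAG_ARG_V, since an argument is also VFP.
       struct { char c[24]; } g(void):  size > 16, so the result goes
         through the hidden x8 pointer: AARCH64_RET_VOID | AARCH64_RET_IN_MEM,
         with 8 extra stack bytes budgeted.
       struct { char c[3]; } h(void):  returned in registers but not a
         whole word: AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY, copied
         out to the caller's buffer afterwards.  */
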
+
+#if defined (__APPLE__)
+/* Perform Apple-specific cif processing for variadic calls.  */
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
+ unsigned int nfixedargs,
+ unsigned int ntotalargs)
{
- copy_basic_type (
- allocate_to_register_or_stack (context, stack, state, type),
- value,
- type);
+ ffi_status status = ffi_prep_cif_machdep (cif);
+ cif->aarch64_nfixedargs = nfixedargs;
+ return status;
}
+#endif /* __APPLE__ */
-/* Marshall the arguments from FFI representation to procedure call
- context and stack. */
+extern void ffi_call_SYSV (struct call_context *context, void *frame,
+ void (*fn)(void), void *rvalue, int flags,
+ void *closure) FFI_HIDDEN;
-static unsigned
-aarch64_prep_args (struct call_context *context, unsigned char *stack,
- extended_cif *ecif)
+/* Call a function with the provided arguments and capture the return
+ value. */
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
+ void **avalue, void *closure)
{
- int i;
+ struct call_context *context;
+ void *stack, *frame, *rvalue;
struct arg_state state;
-
- arg_init (&state, ALIGN(ecif->cif->bytes, 16));
-
- for (i = 0; i < ecif->cif->nargs; i++)
+ size_t stack_bytes, rtype_size, rsize;
+ int i, nargs, flags;
+ ffi_type *rtype;
+
+ flags = cif->flags;
+ rtype = cif->rtype;
+ rtype_size = rtype->size;
+ stack_bytes = cif->bytes;
+
+ /* If the target function returns a structure via hidden pointer,
+ then we cannot allow a null rvalue. Otherwise, mash a null
+ rvalue to void return type. */
+ rsize = 0;
+ if (flags & AARCH64_RET_IN_MEM)
+ {
+ if (orig_rvalue == NULL)
+ rsize = rtype_size;
+ }
+ else if (orig_rvalue == NULL)
+ flags &= AARCH64_FLAG_ARG_V;
+ else if (flags & AARCH64_RET_NEED_COPY)
+ rsize = 16;
+
+ /* Allocate consecutive stack space for everything we'll need. */
+ context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
+ stack = context + 1;
+ frame = stack + stack_bytes;
+ rvalue = (rsize ? frame + 32 : orig_rvalue);
+
+ arg_init (&state);
+ for (i = 0, nargs = cif->nargs; i < nargs; i++)
{
- ffi_type *ty = ecif->cif->arg_types[i];
- switch (ty->type)
+ ffi_type *ty = cif->arg_types[i];
+ size_t s = ty->size;
+ void *a = avalue[i];
+ int h, t;
+
+ t = ty->type;
+ switch (t)
{
case FFI_TYPE_VOID:
FFI_ASSERT (0);
@@ -613,239 +597,194 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
/* If the argument is a basic type the argument is allocated to an
appropriate register, or if none are available, to the stack. */
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
+ case FFI_TYPE_INT:
case FFI_TYPE_UINT8:
case FFI_TYPE_SINT8:
case FFI_TYPE_UINT16:
case FFI_TYPE_SINT16:
case FFI_TYPE_UINT32:
- case FFI_TYPE_INT:
case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
- copy_to_register_or_stack (context, stack, &state,
- ecif->avalue[i], ty->type);
+ case FFI_TYPE_POINTER:
+ do_pointer:
+ {
+ ffi_arg ext = extend_integer_type (a, t);
+ if (state.ngrn < N_X_ARG_REG)
+ context->x[state.ngrn++] = ext;
+ else
+ {
+ void *d = allocate_to_stack (&state, stack, ty->alignment, s);
+ state.ngrn = N_X_ARG_REG;
+ /* Note that the default abi extends each argument
+ to a full 64-bit slot, while the iOS abi allocates
+ only enough space. */
+#ifdef __APPLE__
+ memcpy(d, a, s);
+#else
+ *(ffi_arg *)d = ext;
+#endif
+ }
+ }
break;
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
case FFI_TYPE_STRUCT:
- if (is_hfa (ty))
- {
- copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
- stack, &state);
- }
- else if (ty->size > 16)
- {
- /* If the argument is a composite type that is larger than 16
- bytes, then the argument has been copied to memory, and
- the argument is replaced by a pointer to the copy. */
+ case FFI_TYPE_COMPLEX:
+ {
+ void *dest;
- copy_to_register_or_stack (context, stack, &state,
- &(ecif->avalue[i]), FFI_TYPE_POINTER);
- }
- else if (available_x (&state) >= (ty->size + 7) / 8)
- {
- /* If the argument is a composite type and the size in
- double-words is not more than the number of available
- X registers, then the argument is copied into consecutive
- X registers. */
- int j;
- for (j = 0; j < (ty->size + 7) / 8; j++)
- {
- memcpy (allocate_to_x (context, &state),
- &(((UINT64 *) ecif->avalue[i])[j]),
- sizeof (UINT64));
+ h = is_vfp_type (ty);
+ if (h)
+ {
+ int elems = 4 - (h & 3);
+ if (state.nsrn + elems <= N_V_ARG_REG)
+ {
+ dest = &context->v[state.nsrn];
+ state.nsrn += elems;
+ extend_hfa_type (dest, a, h);
+ break;
+ }
+ state.nsrn = N_V_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ }
+ else if (s > 16)
+ {
+ /* If the argument is a composite type that is larger than 16
+ bytes, then the argument has been copied to memory, and
+ the argument is replaced by a pointer to the copy. */
+ a = &avalue[i];
+ t = FFI_TYPE_POINTER;
+ goto do_pointer;
+ }
+ else
+ {
+ size_t n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ /* If the argument is a composite type and the size in
+ double-words is not more than the number of available
+ X registers, then the argument is copied into
+ consecutive X registers. */
+ dest = &context->x[state.ngrn];
+ state.ngrn += n;
+ }
+ else
+ {
+ /* Otherwise, there are insufficient X registers. Further
+ X register allocations are prevented, the NSAA is
+ adjusted and the argument is copied to memory at the
+ adjusted NSAA. */
+ state.ngrn = N_X_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ }
}
- }
- else
- {
- /* Otherwise, there are insufficient X registers. Further X
- register allocations are prevented, the NSAA is adjusted
- (by allocate_to_stack ()) and the argument is copied to
- memory at the adjusted NSAA. */
- state.ngrn = N_X_ARG_REG;
-
- memcpy (allocate_to_stack (&state, stack, ty->alignment,
- ty->size), ecif->avalue + i, ty->size);
+ memcpy (dest, a, s);
}
break;
default:
- FFI_ASSERT (0);
- break;
+ abort();
}
+
+#if defined (__APPLE__)
+ if (i + 1 == cif->aarch64_nfixedargs)
+ {
+ state.ngrn = N_X_ARG_REG;
+ state.nsrn = N_V_ARG_REG;
+ state.allocating_variadic = 1;
+ }
+#endif
}
- return ecif->cif->aarch64_flags;
+ ffi_call_SYSV (context, frame, fn, rvalue, flags, closure);
+
+ if (flags & AARCH64_RET_NEED_COPY)
+ memcpy (orig_rvalue, rvalue, rtype_size);
}
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
+void
+ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
{
- /* Round the stack up to a multiple of the stack alignment requirement. */
- cif->bytes =
- (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1);
-
- /* Initialize our flags. We are interested if this CIF will touch a
- vector register, if so we will enable context save and load to
- those registers, otherwise not. This is intended to be friendly
- to lazy float context switching in the kernel. */
- cif->aarch64_flags = 0;
-
- if (is_v_register_candidate (cif->rtype))
- {
- cif->aarch64_flags |= AARCH64_FFI_WITH_V;
- }
- else
- {
- int i;
- for (i = 0; i < cif->nargs; i++)
- if (is_v_register_candidate (cif->arg_types[i]))
- {
- cif->aarch64_flags |= AARCH64_FFI_WITH_V;
- break;
- }
- }
-
- return FFI_OK;
+ ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
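
For reference, a minimal caller-side use of this entry point; this is the standard public libffi API, independent of this port:

    #include <ffi.h>
    #include <stdio.h>

    static double sum_sq (double a, double b) { return a * a + b * b; }

    int main (void)
    {
      ffi_cif cif;
      ffi_type *atypes[] = { &ffi_type_double, &ffi_type_double };
      double a = 3.0, b = 4.0, r;
      void *avals[] = { &a, &b };

      if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2,
                        &ffi_type_double, atypes) == FFI_OK)
        {
          ffi_call (&cif, FFI_FN (sum_sq), &r, avals);
          printf ("%f\n", r);   /* 25.000000 */
        }
      return 0;
    }
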
-/* Call a function with the provided arguments and capture the return
- value. */
+#ifdef FFI_GO_CLOSURES
void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
+ void **avalue, void *closure)
{
- extended_cif ecif;
+ ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+#endif /* FFI_GO_CLOSURES */
- ecif.cif = cif;
- ecif.avalue = avalue;
- ecif.rvalue = rvalue;
+/* Build a trampoline. */
- switch (cif->abi)
- {
- case FFI_SYSV:
- {
- struct call_context context;
- unsigned stack_bytes;
+extern void ffi_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
- /* Figure out the total amount of stack space we need, the
- above call frame space needs to be 16 bytes aligned to
- ensure correct alignment of the first object inserted in
- that space hence the ALIGN applied to cif->bytes.*/
- stack_bytes = ALIGN(cif->bytes, 16);
+ffi_status
+ffi_prep_closure_loc (ffi_closure *closure,
+ ffi_cif* cif,
+ void (*fun)(ffi_cif*,void*,void**,void*),
+ void *user_data,
+ void *codeloc)
+{
+ static const unsigned char trampoline[16] = {
+ 0x90, 0x00, 0x00, 0x58, /* ldr x16, tramp+16 */
+ 0xf1, 0xff, 0xff, 0x10, /* adr x17, tramp+0 */
+ 0x00, 0x02, 0x1f, 0xd6 /* br x16 */
+ };
+ char *tramp = closure->tramp;
+ void (*start)(void);
- memset (&context, 0, sizeof (context));
- if (is_register_candidate (cif->rtype))
- {
- ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
- switch (cif->rtype->type)
- {
- case FFI_TYPE_VOID:
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT64:
- {
- void *addr = get_basic_type_addr (cif->rtype->type,
- &context, 0);
- copy_basic_type (rvalue, addr, cif->rtype->type);
- break;
- }
+ if (cif->abi != FFI_SYSV)
+ return FFI_BAD_ABI;
- case FFI_TYPE_STRUCT:
- if (is_hfa (cif->rtype))
- {
- int j;
- unsigned short type = get_homogeneous_type (cif->rtype);
- unsigned elems = element_count (cif->rtype);
- for (j = 0; j < elems; j++)
- {
- void *reg = get_basic_type_addr (type, &context, j);
- copy_basic_type (rvalue, reg, type);
- rvalue += get_basic_type_size (type);
- }
- }
- else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
- {
- unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64));
- memcpy (rvalue, get_x_addr (&context, 0), size);
- }
- else
- {
- FFI_ASSERT (0);
- }
- break;
-
- default:
- FFI_ASSERT (0);
- break;
- }
- }
- else
- {
- memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
- ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
- stack_bytes, fn);
- }
- break;
- }
+ closure->cif = cif;
+ closure->fun = fun;
+ closure->user_data = user_data;
- default:
- FFI_ASSERT (0);
- break;
- }
-}
+ memcpy (tramp, trampoline, sizeof(trampoline));
-static unsigned char trampoline [] =
-{ 0x70, 0x00, 0x00, 0x58, /* ldr x16, 1f */
- 0x91, 0x00, 0x00, 0x10, /* adr x17, 2f */
- 0x00, 0x02, 0x1f, 0xd6 /* br x16 */
-};
+ if (cif->flags & AARCH64_FLAG_ARG_V)
+ start = ffi_closure_SYSV_V;
+ else
+ start = ffi_closure_SYSV;
+ *(UINT64 *)(tramp + 16) = (uintptr_t)start;
-/* Build a trampoline. */
+ ffi_clear_cache(tramp, tramp + FFI_TRAMPOLINE_SIZE);
+
+ return FFI_OK;
+}
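
A closure sketch showing what the 16-byte trampoline ends up wiring together, again using only the standard public API (error paths abbreviated for brevity):

    static void
    add_five (ffi_cif *cif, void *ret, void **args, void *user_data)
    {
      (void) cif; (void) user_data;
      /* Integer returns narrower than ffi_arg must be widened.  */
      *(ffi_arg *) ret = *(int *) args[0] + 5;
    }

    int call_it (void)
    {
      ffi_cif cif;
      ffi_type *atypes[] = { &ffi_type_sint };
      void *code;
      ffi_closure *cl = ffi_closure_alloc (sizeof (ffi_closure), &code);
      int (*fn) (int);
      int r = 0;

      if (cl
          && ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 1,
                           &ffi_type_sint, atypes) == FFI_OK
          && ffi_prep_closure_loc (cl, &cif, add_five, NULL, code) == FFI_OK)
        {
          fn = (int (*) (int)) code;
          /* fn(37) enters the ldr/adr/br trampoline, which loads
             ffi_closure_SYSV into x16 and the closure into x17.  */
          r = fn (37);   /* 42 */
        }
      if (cl)
        ffi_closure_free (cl);
      return r;
    }
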
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS) \
- ({unsigned char *__tramp = (unsigned char*)(TRAMP); \
- UINT64 __fun = (UINT64)(FUN); \
- UINT64 __ctx = (UINT64)(CTX); \
- UINT64 __flags = (UINT64)(FLAGS); \
- memcpy (__tramp, trampoline, sizeof (trampoline)); \
- memcpy (__tramp + 12, &__fun, sizeof (__fun)); \
- memcpy (__tramp + 20, &__ctx, sizeof (__ctx)); \
- memcpy (__tramp + 28, &__flags, sizeof (__flags)); \
- __clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE); \
- })
+#ifdef FFI_GO_CLOSURES
+extern void ffi_go_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_go_closure_SYSV_V (void) FFI_HIDDEN;
ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
- ffi_cif* cif,
- void (*fun)(ffi_cif*,void*,void**,void*),
- void *user_data,
- void *codeloc)
+ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif* cif,
+ void (*fun)(ffi_cif*,void*,void**,void*))
{
+ void (*start)(void);
+
if (cif->abi != FFI_SYSV)
return FFI_BAD_ABI;
- FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
- cif->aarch64_flags);
+ if (cif->flags & AARCH64_FLAG_ARG_V)
+ start = ffi_go_closure_SYSV_V;
+ else
+ start = ffi_go_closure_SYSV;
- closure->cif = cif;
- closure->user_data = user_data;
- closure->fun = fun;
+ closure->tramp = start;
+ closure->cif = cif;
+ closure->fun = fun;
return FFI_OK;
}
+#endif /* FFI_GO_CLOSURES */
/* Primary handler to setup and invoke a function within a closure.
@@ -857,220 +796,115 @@ ffi_prep_closure_loc (ffi_closure* closure,
the stack at the point ffi_closure_SYSV() was invoked.
On the return path the assembler wrapper will reload call context
- regsiters.
+ registers.
ffi_closure_SYSV_inner() marshalls the call context into ffi value
- desriptors, invokes the wrapped function, then marshalls the return
+ descriptors, invokes the wrapped function, then marshalls the return
value back into the call context. */
-void
-ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
- void *stack)
+int FFI_HIDDEN
+ffi_closure_SYSV_inner (ffi_cif *cif,
+ void (*fun)(ffi_cif*,void*,void**,void*),
+ void *user_data,
+ struct call_context *context,
+ void *stack, void *rvalue, void *struct_rvalue)
{
- ffi_cif *cif = closure->cif;
void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
- void *rvalue = NULL;
- int i;
+ int i, h, nargs, flags;
struct arg_state state;
- arg_init (&state, ALIGN(cif->bytes, 16));
+ arg_init (&state);
- for (i = 0; i < cif->nargs; i++)
+ for (i = 0, nargs = cif->nargs; i < nargs; i++)
{
ffi_type *ty = cif->arg_types[i];
+ int t = ty->type;
+ size_t n, s = ty->size;
- switch (ty->type)
+ switch (t)
{
case FFI_TYPE_VOID:
FFI_ASSERT (0);
break;
+ case FFI_TYPE_INT:
case FFI_TYPE_UINT8:
case FFI_TYPE_SINT8:
case FFI_TYPE_UINT16:
case FFI_TYPE_SINT16:
case FFI_TYPE_UINT32:
case FFI_TYPE_SINT32:
- case FFI_TYPE_INT:
- case FFI_TYPE_POINTER:
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
- avalue[i] = allocate_to_register_or_stack (context, stack,
- &state, ty->type);
+ case FFI_TYPE_POINTER:
+ avalue[i] = allocate_int_to_reg_or_stack (context, &state, stack, s);
break;
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_LONGDOUBLE:
case FFI_TYPE_STRUCT:
- if (is_hfa (ty))
+ case FFI_TYPE_COMPLEX:
+ h = is_vfp_type (ty);
+ if (h)
{
- unsigned n = element_count (ty);
- if (available_v (&state) < n)
+ n = 4 - (h & 3);
+ if (state.nsrn + n <= N_V_ARG_REG)
{
- state.nsrn = N_V_ARG_REG;
- avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
- ty->size);
+ void *reg = &context->v[state.nsrn];
+ state.nsrn += n;
+
+ /* Eeek! We need a pointer to the structure; however, the
+    homogeneous float elements are being passed in individual
+    registers, so for float and double the structure is not
+    represented as a contiguous sequence of bytes in our saved
+    register context.  We don't need the original contents of
+    the register storage, so we reformat the structure into
+    the same memory.  */
+ avalue[i] = compress_hfa_type (reg, reg, h);
}
else
{
- switch (get_homogeneous_type (ty))
- {
- case FFI_TYPE_FLOAT:
- {
- /* Eeek! We need a pointer to the structure,
- however the homogeneous float elements are
- being passed in individual S registers,
- therefore the structure is not represented as
- a contiguous sequence of bytes in our saved
- register context. We need to fake up a copy
- of the structure layed out in memory
- correctly. The fake can be tossed once the
- closure function has returned hence alloca()
- is sufficient. */
- int j;
- UINT32 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < element_count (ty); j++)
- memcpy (&p[j],
- allocate_to_s (context, &state),
- sizeof (*p));
- break;
- }
-
- case FFI_TYPE_DOUBLE:
- {
- /* Eeek! We need a pointer to the structure,
- however the homogeneous float elements are
- being passed in individual S registers,
- therefore the structure is not represented as
- a contiguous sequence of bytes in our saved
- register context. We need to fake up a copy
- of the structure layed out in memory
- correctly. The fake can be tossed once the
- closure function has returned hence alloca()
- is sufficient. */
- int j;
- UINT64 *p = avalue[i] = alloca (ty->size);
- for (j = 0; j < element_count (ty); j++)
- memcpy (&p[j],
- allocate_to_d (context, &state),
- sizeof (*p));
- break;
- }
-
- case FFI_TYPE_LONGDOUBLE:
- memcpy (&avalue[i],
- allocate_to_v (context, &state),
- sizeof (*avalue));
- break;
-
- default:
- FFI_ASSERT (0);
- break;
- }
+ state.nsrn = N_V_ARG_REG;
+ avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
}
}
- else if (ty->size > 16)
+ else if (s > 16)
{
/* Replace Composite type of size greater than 16 with a
pointer. */
- memcpy (&avalue[i],
- allocate_to_register_or_stack (context, stack,
- &state, FFI_TYPE_POINTER),
- sizeof (avalue[i]));
- }
- else if (available_x (&state) >= (ty->size + 7) / 8)
- {
- avalue[i] = get_x_addr (context, state.ngrn);
- state.ngrn += (ty->size + 7) / 8;
+ avalue[i] = *(void **)
+ allocate_int_to_reg_or_stack (context, &state, stack,
+ sizeof (void *));
}
else
{
- state.ngrn = N_X_ARG_REG;
-
- avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
- ty->size);
+ n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ avalue[i] = &context->x[state.ngrn];
+ state.ngrn += n;
+ }
+ else
+ {
+ state.ngrn = N_X_ARG_REG;
+ avalue[i] = allocate_to_stack (&state, stack,
+ ty->alignment, s);
+ }
}
break;
default:
- FFI_ASSERT (0);
- break;
+ abort();
}
}
- /* Figure out where the return value will be passed, either in
- registers or in a memory block allocated by the caller and passed
- in x8. */
+ flags = cif->flags;
+ if (flags & AARCH64_RET_IN_MEM)
+ rvalue = struct_rvalue;
- if (is_register_candidate (cif->rtype))
- {
- /* Register candidates are *always* returned in registers. */
-
- /* Allocate a scratchpad for the return value, we will let the
- callee scrible the result into the scratch pad then move the
- contents into the appropriate return value location for the
- call convention. */
- rvalue = alloca (cif->rtype->size);
- (closure->fun) (cif, rvalue, avalue, closure->user_data);
-
- /* Copy the return value into the call context so that it is returned
- as expected to our caller. */
- switch (cif->rtype->type)
- {
- case FFI_TYPE_VOID:
- break;
-
- case FFI_TYPE_UINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_POINTER:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_INT:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_SINT64:
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
- case FFI_TYPE_LONGDOUBLE:
- {
- void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
- copy_basic_type (addr, rvalue, cif->rtype->type);
- break;
- }
- case FFI_TYPE_STRUCT:
- if (is_hfa (cif->rtype))
- {
- int i;
- unsigned short type = get_homogeneous_type (cif->rtype);
- unsigned elems = element_count (cif->rtype);
- for (i = 0; i < elems; i++)
- {
- void *reg = get_basic_type_addr (type, context, i);
- copy_basic_type (reg, rvalue, type);
- rvalue += get_basic_type_size (type);
- }
- }
- else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
- {
- unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
- memcpy (get_x_addr (context, 0), rvalue, size);
- }
- else
- {
- FFI_ASSERT (0);
- }
- break;
- default:
- FFI_ASSERT (0);
- break;
- }
- }
- else
- {
- memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
- (closure->fun) (cif, rvalue, avalue, closure->user_data);
- }
-}
+ fun (cif, rvalue, avalue, user_data);
+ return flags;
+}
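
The int returned here is the other half of a contract with the assembly wrapper: it tells ffi_closure_SYSV how to move the scratch rvalue into the machine return registers. A rough sketch of that dispatch, as comments, since the real consumer is the branch table in sysv.S (the exact mechanics there are assumed, not shown in this diff):

    /* flags == AARCH64_RET_VOID    -> load nothing
       flags == AARCH64_RET_INT64   -> ldr   x0, [rvalue]
       flags == AARCH64_RET_SINT32  -> ldrsw x0, [rvalue]
       flags == AARCH64_RET_D1      -> ldr   d0, [rvalue]
       flags & AARCH64_RET_IN_MEM   -> nothing to load; the result was
                                       already written via struct_rvalue  */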