21 files changed, 2947 insertions, 103 deletions
diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
new file mode 100644
index 0000000..1405665
--- /dev/null
+++ b/libffi/src/aarch64/ffi.c
@@ -0,0 +1,1076 @@
+/* Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#include <stdio.h>
+
+#include <ffi.h>
+#include <ffi_common.h>
+
+#include <stdlib.h>
+
+/* Stack alignment requirement in bytes */
+#define AARCH64_STACK_ALIGN 16
+
+#define N_X_ARG_REG 8	/* Upper bound of arg_state.ngrn (X argument registers).  */
+#define N_V_ARG_REG 8	/* Upper bound of arg_state.nsrn (V argument registers).  */
+
+#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)	/* CIF touches V regs.  */
+
+union _d
+{
+  UINT64 d;	/* The storage viewed as one UINT64.  */
+  UINT32 s[2];	/* The same storage viewed as two UINT32 words.  */
+};
+
+struct call_context
+{
+  UINT64 x [AARCH64_N_XREG];	/* One slot per X register (see get_x_addr).  */
+  struct
+  {
+    union _d d[2];		/* The two halves of one V register slot.  */
+  } v [AARCH64_N_VREG];		/* One slot per V register (see get_v_addr).  */
+};
+
+static void *
+get_x_addr (struct call_context *context, unsigned n)
+{
+  return context->x + n;	/* Address of the slot for register Xn.  */
+}
+
+static void *
+get_s_addr (struct call_context *context, unsigned n)
+{
+#ifndef __AARCH64EB__
+  return context->v[n].d[0].s;		/* First word of the slot.  */
+#else
+  return context->v[n].d[1].s + 1;	/* Last word of the slot.  */
+#endif
+}
+
+static void *
+get_d_addr (struct call_context *context, unsigned n)
+{
+#ifndef __AARCH64EB__
+  return context->v[n].d;	/* First doubleword of the slot.  */
+#else
+  return context->v[n].d + 1;	/* Second doubleword of the slot.  */
+#endif
+}
+
+static void *
+get_v_addr (struct call_context *context, unsigned n)
+{
+  return context->v + n;	/* Start of the whole slot for register Vn.  */
+}
+
+/* Locate, inside CONTEXT, the bytes that a value of basic type TYPE
+   would occupy were it held in register number N.  */
+
+static void *
+get_basic_type_addr (unsigned short type, struct call_context *context,
+		     unsigned n)
+{
+  switch (type)
+    {
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_INT:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      return get_x_addr (context, n);	/* Integers and pointers: X slot.  */
+    case FFI_TYPE_FLOAT:
+      return get_s_addr (context, n);	/* S view of the V slot.  */
+    case FFI_TYPE_DOUBLE:
+      return get_d_addr (context, n);	/* D view of the V slot.  */
+    case FFI_TYPE_LONGDOUBLE:
+      return get_v_addr (context, n);	/* The whole V slot.  */
+    default:
+      FFI_ASSERT (0);
+      return NULL;
+    }
+}
+
+/* Return the alignment, in bytes, used for each of the basic types.  */
+
+static size_t
+get_basic_type_alignment (unsigned short type)
+{
+  switch (type)
+    {
+    case FFI_TYPE_LONGDOUBLE:
+      return sizeof (long double);
+    /* Every other basic type, float included, is aligned like a UINT64.  */
+    case FFI_TYPE_FLOAT:
+    case FFI_TYPE_DOUBLE:
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      return sizeof (UINT64);
+
+    default:
+      FFI_ASSERT (0);
+      return 0;
+    }
+}
+
+/* Return the number of bytes a value of each basic type occupies.  */
+
+static size_t
+get_basic_type_size (unsigned short type)
+{
+  switch (type)
+    {
+    case FFI_TYPE_UINT8:
+      return sizeof (UINT8);
+    case FFI_TYPE_SINT8:
+      return sizeof (SINT8);
+    case FFI_TYPE_UINT16:
+      return sizeof (UINT16);
+    case FFI_TYPE_SINT16:
+      return sizeof (SINT16);
+    /* A float is sized like a UINT32.  */
+    case FFI_TYPE_FLOAT:
+    case FFI_TYPE_UINT32:
+      return sizeof (UINT32);
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      return sizeof (SINT32);
+    /* A double and a pointer are sized like a UINT64.  */
+    case FFI_TYPE_DOUBLE:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_UINT64:
+      return sizeof (UINT64);
+    case FFI_TYPE_SINT64:
+      return sizeof (SINT64);
+    case FFI_TYPE_LONGDOUBLE:
+      return sizeof (long double);
+
+    default:
+      FFI_ASSERT (0);
+      return 0;
+    }
+}
+
+extern void	/* Declared .globl in sysv.S.  */
+ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
+			    extended_cif *),	/* Marshaller; ffi_call passes aarch64_prep_args.  */
+               struct call_context *context,	/* Register context to fill and read back.  */
+               extended_cif *,			/* Handed on to the marshaller.  */
+               unsigned,			/* Stack bytes; ffi_call passes ALIGN (cif->bytes, 16).  */
+               void (*fn)(void));		/* The function to call.  */
+
+extern void	/* Address installed in trampolines by ffi_prep_closure_loc.  */
+ffi_closure_SYSV (ffi_closure *);
+
+/* Nonzero when TYPE is one of the FFI floating point type codes.  */
+
+static unsigned
+is_floating_type (unsigned short type)
+{
+  return !(type != FFI_TYPE_LONGDOUBLE && type != FFI_TYPE_DOUBLE
+	   && type != FFI_TYPE_FLOAT);
+}
+
+/* Test for a homogeneous structure.
+
+   Return the basic type shared by every leaf member of TY.  If TY is
+   a struct whose (possibly nested) members differ in type, or a struct
+   with no members at all, return FFI_TYPE_STRUCT.  A TY that is not a
+   struct is returned as its own type code.  */
+
+static unsigned short
+get_homogeneous_type (ffi_type *ty)
+{
+  /* An absent or empty element list has no first member to inspect;
+     such a type is left to the base case below.  */
+  if (ty->type == FFI_TYPE_STRUCT && ty->elements && ty->elements[0])
+    {
+      unsigned i;
+      unsigned short candidate_type
+	= get_homogeneous_type (ty->elements[0]);
+      for (i = 1; ty->elements[i]; i++)
+	{
+	  /* Recursing suits every member: a nested struct yields its
+	     own homogeneous type, while a basic type or a struct
+	     without members yields its own type code.  If that fits
+	     with our candidate type, we are still homogeneous.  */
+	  unsigned short iteration_type
+	    = get_homogeneous_type (ty->elements[i]);
+
+	  /* If we are not homogeneous, return FFI_TYPE_STRUCT.  */
+	  if (candidate_type != iteration_type)
+	    return FFI_TYPE_STRUCT;
+	}
+      return candidate_type;
+    }
+
+  /* Base case, we have no more levels of nesting, so we are trivially
+     homogeneous in our own type.  */
+  return ty->type;
+}
+
+/* Count the leaf members of a STRUCT.
+
+   Nested structs are flattened: each one contributes the number of
+   leaves it contains rather than counting as a single element.
+   A TY that is not a STRUCT, or has no element list, yields 0.  */
+
+static unsigned
+element_count (ffi_type *ty)
+{
+  ffi_type **member;
+  unsigned total = 0;
+
+  if (ty->type != FFI_TYPE_STRUCT || !ty->elements)
+    return 0;
+
+  for (member = ty->elements; *member; member++)
+    {
+      ffi_type *sub = *member;
+      if (sub->type == FFI_TYPE_STRUCT && sub->elements)
+	total += element_count (sub);
+      else
+	total += 1;
+    }
+  return total;
+}
+
+/* Test for a homogeneous floating point aggregate.
+
+   As implemented here, that is a struct whose leaf members (nested
+   structs included) all share one of the floating point types accepted
+   by is_floating_type () and which has one to four such leaves in
+   total.  A struct without an element list is never an HFA.  */
+
+static int
+is_hfa (ffi_type *ty)
+{
+  if (ty->type == FFI_TYPE_STRUCT
+      && ty->elements && ty->elements[0]
+      && is_floating_type (get_homogeneous_type (ty)))
+    {
+      unsigned n = element_count (ty);
+      return n >= 1 && n <= 4;
+    }
+  return 0;
+}
+
+/* Decide whether a value of type TY is eligible for registers.
+
+   Only eligibility is tested here: whether enough registers of the
+   appropriate class are actually free at the point of use is not
+   checked.  IFF they are, the argument will be passed in
+   register(s).
+
+   Note that an ffi_type that is deemed to be a register candidate
+   will always be returned in registers.
+
+   Returns 1 if a register candidate else 0.  */
+
+static int
+is_register_candidate (ffi_type *ty)
+{
+  if (ty->type == FFI_TYPE_STRUCT)
+    {
+      /* A homogeneous floating point aggregate qualifies whatever
+	 its size.  */
+      if (is_hfa (ty))
+	return 1;
+
+      /* Too large. Will be replaced with a pointer to memory. The
+	 pointer MAY be passed in a register, but the value will
+	 not. This test specifically fails since the argument will
+	 never be passed by value in registers. */
+      if (ty->size > 16)
+	return 0;
+
+      /* Might be passed in registers depending on the number of
+	 registers required. */
+      return (ty->size + 7) / 8 < N_X_ARG_REG;
+    }
+
+  /* Void and every basic type qualify.  */
+  switch (ty->type)
+    {
+    case FFI_TYPE_VOID:
+    case FFI_TYPE_FLOAT:
+    case FFI_TYPE_DOUBLE:
+    case FFI_TYPE_LONGDOUBLE:
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT64:
+      return 1;
+
+    default:
+      /* Not a type code handled by this port.  */
+      FFI_ASSERT (0);
+      break;
+    }
+
+  return 0;
+}
+
+/* Nonzero if TY, as an argument or a result, is a candidate for the
+   vector registers: a floating point basic type or an HFA.  */
+
+static int
+is_v_register_candidate (ffi_type *ty)
+{
+  return is_floating_type (ty->type)
+	 ? 1 : (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
+}
+
+/* Representation of the procedure call argument marshalling
+   state.
+
+   The terse state variable names match the names used in the AARCH64
+   PCS. */
+
+struct arg_state
+{
+  unsigned ngrn;                /* Next X register to hand out; at most N_X_ARG_REG. */
+  unsigned nsrn;                /* Next V register to hand out; at most N_V_ARG_REG. */
+  unsigned nsaa;                /* Byte offset of the next stack slot from the stack base. */
+};
+
+/* Reset STATE so that no argument register and no stack space is in
+   use.  CALL_FRAME_SIZE is accepted but not consulted.  */
+static void
+arg_init (struct arg_state *state, unsigned call_frame_size)
+{
+  /* NGRN, NSRN and NSAA all start from zero.  */
+  state->nsaa = state->nsrn = state->ngrn = 0;
+}
+
+/* Count the core argument registers that have not been handed out.  */
+
+static unsigned
+available_x (struct arg_state *state)
+{
+  unsigned next_free = state->ngrn;
+  return N_X_ARG_REG - next_free;
+}
+
+/* Count the vector argument registers that have not been handed out.  */
+
+static unsigned
+available_v (struct arg_state *state)
+{
+  unsigned next_free = state->nsrn;
+  return N_V_ARG_REG - next_free;
+}
+
+static void *	/* Slot of the next free X register, which is consumed.  */
+allocate_to_x (struct call_context *context, struct arg_state *state)
+{
+  FFI_ASSERT (state->ngrn < N_X_ARG_REG);	/* A free X register must exist.  */
+  return get_x_addr (context, (state->ngrn)++);
+}
+
+static void *	/* S view of the next free V register, which is consumed.  */
+allocate_to_s (struct call_context *context, struct arg_state *state)
+{
+  FFI_ASSERT (state->nsrn < N_V_ARG_REG);	/* A free V register must exist.  */
+  return get_s_addr (context, (state->nsrn)++);
+}
+
+static void *	/* D view of the next free V register, which is consumed.  */
+allocate_to_d (struct call_context *context, struct arg_state *state)
+{
+  FFI_ASSERT (state->nsrn < N_V_ARG_REG);	/* A free V register must exist.  */
+  return get_d_addr (context, (state->nsrn)++);
+}
+
+static void *	/* Whole slot of the next free V register, which is consumed.  */
+allocate_to_v (struct call_context *context, struct arg_state *state)
+{
+  FFI_ASSERT (state->nsrn < N_V_ARG_REG);	/* A free V register must exist.  */
+  return get_v_addr (context, (state->nsrn)++);
+}
+
+/* Allocate an aligned slot of SIZE bytes in the argument area starting
+   at STACK, advance the NSAA past it and return a pointer to the slot.  */
+static void *
+allocate_to_stack (struct arg_state *state, void *stack, unsigned alignment,
+		   unsigned size)
+{
+  void *allocation;
+
+  /* Round up the NSAA to the larger of 8 or the natural alignment of the
+     argument's type.  Aligning to ALIGNMENT once is sufficient.  */
+  state->nsaa = ALIGN (state->nsaa, alignment);
+  state->nsaa = ALIGN (state->nsaa, 8);
+
+  /* Step in bytes explicitly; arithmetic on void * is a GNU extension.  */
+  allocation = (unsigned char *) stack + state->nsaa;
+  state->nsaa += size;
+  return allocation;
+}
+
+static void
+copy_basic_type (void *dest, void *source, unsigned short type)
+{
+  /* Copy one value of basic type TYPE from SOURCE to DEST.  Integers are
+     widened (zero or sign extended) to ffi_arg/ffi_sarg, as libffi expects.  */
+  switch (type)
+    {
+    case FFI_TYPE_FLOAT:
+      *(float *) dest = *(float *) source;
+      break;
+    case FFI_TYPE_DOUBLE:
+      *(double *) dest = *(double *) source;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      *(long double *) dest = *(long double *) source;
+      break;
+    case FFI_TYPE_UINT8:
+      *(ffi_arg *) dest = *(UINT8 *) source;	/* Writes a whole ffi_arg.  */
+      break;
+    case FFI_TYPE_SINT8:
+      *(ffi_sarg *) dest = *(SINT8 *) source;	/* Writes a whole ffi_sarg.  */
+      break;
+    case FFI_TYPE_UINT16:
+      *(ffi_arg *) dest = *(UINT16 *) source;
+      break;
+    case FFI_TYPE_SINT16:
+      *(ffi_sarg *) dest = *(SINT16 *) source;
+      break;
+    case FFI_TYPE_UINT32:
+      *(ffi_arg *) dest = *(UINT32 *) source;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      *(ffi_sarg *) dest = *(SINT32 *) source;
+      break;
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_UINT64:
+      *(ffi_arg *) dest = *(UINT64 *) source;
+      break;
+    case FFI_TYPE_SINT64:
+      *(ffi_sarg *) dest = *(SINT64 *) source;
+      break;
+
+    default:
+      FFI_ASSERT (0);	/* Not a basic type: nothing is copied.  */
+    }
+}
+
+static void
+copy_hfa_to_reg_or_stack (void *memory,
+			  ffi_type *ty,
+			  struct call_context *context,
+			  unsigned char *stack,
+			  struct arg_state *state)
+{
+  /* Pass the HFA at MEMORY in consecutive V registers, else on the stack.  */
+  unsigned elems = element_count (ty);
+  if (available_v (state) < elems)
+    {
+      /* Insufficient V registers: block further V allocations and copy
+	 the argument to the NSAA as adjusted by allocate_to_stack ().  */
+      state->nsrn = N_V_ARG_REG;
+      memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
+	      memory,
+	      ty->size);
+    }
+  else
+    {
+      /* Each element takes the S, D or Q view of its own V register.  */
+      unsigned i;
+      unsigned short type = get_homogeneous_type (ty);
+      for (i = 0; i < elems; i++)
+	{
+	  void *reg = get_basic_type_addr (type, context, state->nsrn++);
+	  copy_basic_type (reg, memory, type);
+	  memory = (unsigned char *) memory + get_basic_type_size (type);
+	}
+    }
+}
+
+/* Either allocate an appropriate register for the argument type, or if
+   none are available, allocate a stack slot and return a pointer
+   to the allocated space.
+
+   Floating point registers are addressed through get_basic_type_addr ()
+   so that a float, a double and a long double each land on the bytes
+   of the V register slot that hold the S, D and Q view respectively.  */
+
+static void *
+allocate_to_register_or_stack (struct call_context *context,
+			       unsigned char *stack,
+			       struct arg_state *state,
+			       unsigned short type)
+{
+  size_t alignment = get_basic_type_alignment (type);
+  size_t size = alignment;
+  switch (type)
+    {
+    case FFI_TYPE_FLOAT:
+      /* This is the only case for which the allocated stack size
+	 should not match the alignment of the type.  */
+      size = sizeof (UINT32);
+      /* Fall through.  */
+    case FFI_TYPE_DOUBLE:
+    case FFI_TYPE_LONGDOUBLE:
+      if (state->nsrn < N_V_ARG_REG)
+	return get_basic_type_addr (type, context, state->nsrn++);
+      state->nsrn = N_V_ARG_REG;
+      break;
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_INT:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      if (state->ngrn < N_X_ARG_REG)
+	return allocate_to_x (context, state);
+      state->ngrn = N_X_ARG_REG;
+      break;
+    default:
+      FFI_ASSERT (0);
+    }
+
+  return allocate_to_stack (state, stack, alignment, size);
+}
+
+/* Store VALUE, a basic type, in the next suitable register or, when
+   none is left, in a freshly allocated stack slot.  */
+
+static void
+copy_to_register_or_stack (struct call_context *context,
+			   unsigned char *stack,
+			   struct arg_state *state,
+			   void *value,
+			   unsigned short type)
+{
+  void *slot;
+
+  slot = allocate_to_register_or_stack (context, stack, state, type);
+  copy_basic_type (slot, value, type);
+}
+
+/* Marshall the arguments from FFI representation to procedure call
+   context and stack.  Returns the aarch64_flags of the CIF.  */
+
+static unsigned
+aarch64_prep_args (struct call_context *context, unsigned char *stack,
+		   extended_cif *ecif)
+{
+  int i;
+  struct arg_state state;
+
+  arg_init (&state, ALIGN(ecif->cif->bytes, 16));
+
+  for (i = 0; i < ecif->cif->nargs; i++)
+    {
+      ffi_type *ty = ecif->cif->arg_types[i];
+      switch (ty->type)
+	{
+	case FFI_TYPE_VOID:
+	  FFI_ASSERT (0);
+	  break;
+
+	/* If the argument is a basic type the argument is allocated to an
+	   appropriate register, or if none are available, to the stack.  */
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
+	case FFI_TYPE_UINT8:
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT16:
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_UINT32:
+	case FFI_TYPE_INT:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_POINTER:
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+	  copy_to_register_or_stack (context, stack, &state,
+				     ecif->avalue[i], ty->type);
+	  break;
+
+	case FFI_TYPE_STRUCT:
+	  if (is_hfa (ty))
+	    {
+	      copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
+					stack, &state);
+	    }
+	  else if (ty->size > 16)
+	    {
+	      /* If the argument is a composite type that is larger than 16
+		 bytes, then it is passed by reference: the slot receives
+		 the address held in avalue[i]; no copy is made here.  */
+
+	      copy_to_register_or_stack (context, stack, &state,
+					 &(ecif->avalue[i]), FFI_TYPE_POINTER);
+	    }
+	  else if (available_x (&state) >= (ty->size + 7) / 8)
+	    {
+	      /* Enough X registers are available: copy the argument into
+		 consecutive X registers, one double-word at a time, never
+		 reading beyond the last byte of the composite.  */
+	      size_t done;
+	      for (done = 0; done < ty->size; done += sizeof (UINT64))
+		{
+		  size_t left = ty->size - done;
+		  memcpy (allocate_to_x (context, &state),
+			  (unsigned char *) ecif->avalue[i] + done,
+			  left < sizeof (UINT64) ? left : sizeof (UINT64));
+		}
+	    }
+	  else
+	    {
+	      /* Otherwise, there are insufficient X registers. Further X
+		 register allocations are prevented, the NSAA is adjusted
+		 (by allocate_to_stack ()) and the argument itself, found
+		 at avalue[i], is copied to memory at the adjusted NSAA.  */
+	      state.ngrn = N_X_ARG_REG;
+
+	      memcpy (allocate_to_stack (&state, stack, ty->alignment,
+					 ty->size), ecif->avalue[i], ty->size);
+	    }
+	  break;
+
+	default:
+	  FFI_ASSERT (0);
+	  break;
+	}
+    }
+
+  return ecif->cif->aarch64_flags;
+}
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  /* Machine dependent part of CIF preparation: fix up the stack size
+     and record in aarch64_flags whether vector registers are used.
+     Always returns FFI_OK.  */
+  int i;
+  int uses_v;
+
+  /* Pad the stack size out to the next multiple of the stack alignment
+     requirement.  */
+  cif->bytes += AARCH64_STACK_ALIGN - 1;
+  cif->bytes &= ~ (AARCH64_STACK_ALIGN - 1);
+
+  /* Work out whether this CIF will touch a vector register, looking at
+     the return type first and then at each argument until one is
+     found.  The flag enables context save and load of those registers;
+     leaving it clear is intended to be friendly to lazy float context
+     switching in the kernel.  */
+  uses_v = is_v_register_candidate (cif->rtype);
+  for (i = 0; !uses_v && i < cif->nargs; i++)
+    {
+      uses_v = is_v_register_candidate (cif->arg_types[i]);
+    }
+
+  cif->aarch64_flags = uses_v ? AARCH64_FFI_WITH_V : 0;
+
+  return FFI_OK;
+}
+
+
+/* Call FN with the arguments AVALUE as described by CIF, storing the
+   result (if any) in RVALUE.  */
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  extended_cif ecif;
+
+  ecif.cif = cif;
+  ecif.avalue = avalue;
+  ecif.rvalue = rvalue;
+
+  switch (cif->abi)
+    {
+    case FFI_SYSV:
+      {
+        struct call_context context;
+	unsigned stack_bytes;
+
+	/* The call frame space must be 16 byte aligned so that the
+	   first object inserted in it is correctly aligned, hence
+	   the ALIGN applied to cif->bytes.  */
+	stack_bytes = ALIGN(cif->bytes, 16);
+
+	memset (&context, 0, sizeof (context));
+        if (is_register_candidate (cif->rtype))
+          {
+            ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
+            switch (cif->rtype->type)
+              {
+              case FFI_TYPE_VOID:
+		break;	/* No result: nothing to copy, nothing to assert.  */
+              case FFI_TYPE_FLOAT:
+              case FFI_TYPE_DOUBLE:
+              case FFI_TYPE_LONGDOUBLE:
+              case FFI_TYPE_UINT8:
+              case FFI_TYPE_SINT8:
+              case FFI_TYPE_UINT16:
+              case FFI_TYPE_SINT16:
+              case FFI_TYPE_UINT32:
+              case FFI_TYPE_SINT32:
+              case FFI_TYPE_POINTER:
+              case FFI_TYPE_UINT64:
+              case FFI_TYPE_INT:
+              case FFI_TYPE_SINT64:
+		{
+		  void *addr = get_basic_type_addr (cif->rtype->type,
+						    &context, 0);
+		  copy_basic_type (rvalue, addr, cif->rtype->type);
+		  break;
+		}
+
+              case FFI_TYPE_STRUCT:
+                if (is_hfa (cif->rtype))
+		  {
+		    unsigned short type = get_homogeneous_type (cif->rtype);
+		    unsigned j, elems = element_count (cif->rtype);
+		    for (j = 0; j < elems; j++)
+		      {
+			void *reg = get_basic_type_addr (type, &context, j);
+			copy_basic_type (rvalue, reg, type);
+			rvalue = (unsigned char *) rvalue
+				 + get_basic_type_size (type);
+		      }
+		  }
+                else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
+                  {
+                    /* Copy the struct itself only, never past RVALUE.  */
+                    memcpy (rvalue, get_x_addr (&context, 0), cif->rtype->size);
+                  }
+                else
+                  {
+                    FFI_ASSERT (0);
+                  }
+                break;
+
+              default:
+                FFI_ASSERT (0);
+                break;
+              }
+          }
+        else
+          {
+            memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
+            ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
+			   stack_bytes, fn);
+          }
+        break;
+      }
+
+    default:
+      FFI_ASSERT (0);
+      break;
+    }
+}
+
+static unsigned char trampoline [] =	/* Code template; data words are appended.  */
+{ 0x70, 0x00, 0x00, 0x58,	/* ldr	x16, 1f	(1: word at offset 12, FUN)	*/
+  0x91, 0x00, 0x00, 0x10,	/* adr	x17, 2f	(2: word at offset 20, CTX)	*/
+  0x00, 0x02, 0x1f, 0xd6	/* br	x16	*/
+};
+
+/* Build a trampoline at TRAMP: the code template followed by the 8-byte
+   words FUN (offset 12), CTX (offset 20) and FLAGS (offset 28).  */
+#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS)			\
+  ({unsigned char *__tramp = (unsigned char*)(TRAMP);			\
+    UINT64  __fun = (UINT64)(FUN);					\
+    UINT64  __ctx = (UINT64)(CTX);					\
+    UINT64  __flags = (UINT64)(FLAGS);					\
+    memcpy (__tramp, trampoline, sizeof (trampoline));			\
+    memcpy (__tramp + 12, &__fun, sizeof (__fun));			\
+    memcpy (__tramp + 20, &__ctx, sizeof (__ctx));			\
+    memcpy (__tramp + 28, &__flags, sizeof (__flags));			\
+    __clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE);		\
+  })
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+                      ffi_cif* cif,
+                      void (*fun)(ffi_cif*,void*,void**,void*),
+                      void *user_data,
+                      void *codeloc)
+{
+  /* Only the SYSV ABI is handled; anything else is refused untouched.  */
+  if (cif->abi != FFI_SYSV)
+    return FFI_BAD_ABI;
+  /* Remember what the closure wraps...  */
+  closure->fun = fun;
+  closure->user_data = user_data;
+  closure->cif = cif;
+  /* ...and write its trampoline, which enters ffi_closure_SYSV.  */
+  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
+		       cif->aarch64_flags);
+  return FFI_OK;
+}
+
+/* Primary handler to setup and invoke a function within a closure.
+
+   A closure when invoked enters via the assembler wrapper
+   ffi_closure_SYSV(). The wrapper allocates a call context on the
+   stack, saves the interesting registers (from the perspective of
+   the calling convention) into the context then passes control to
+   ffi_closure_SYSV_inner() passing the saved context and a pointer to
+   the stack at the point ffi_closure_SYSV() was invoked.
+
+   On the return path the assembler wrapper will reload call context
+   registers.
+
+   ffi_closure_SYSV_inner() marshalls the call context into ffi value
+   descriptors, invokes the wrapped function, then marshalls the return
+   value back into the call context.  */
+
+void
+ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
+			void *stack)
+{
+  ffi_cif *cif = closure->cif;
+  void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
+  void *rvalue = NULL;
+  int i;
+  struct arg_state state;
+
+  arg_init (&state, ALIGN(cif->bytes, 16));
+
+  for (i = 0; i < cif->nargs; i++)
+    {
+      ffi_type *ty = cif->arg_types[i];
+
+      switch (ty->type)
+	{
+	case FFI_TYPE_VOID:
+	  FFI_ASSERT (0);
+	  break;
+
+	case FFI_TYPE_UINT8:
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT16:
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_UINT32:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_INT:
+	case FFI_TYPE_POINTER:
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+	case  FFI_TYPE_FLOAT:
+	case  FFI_TYPE_DOUBLE:
+	case  FFI_TYPE_LONGDOUBLE:
+	  avalue[i] = allocate_to_register_or_stack (context, stack,
+						     &state, ty->type);
+	  break;
+
+	case FFI_TYPE_STRUCT:
+	  if (is_hfa (ty))
+	    {
+	      unsigned n = element_count (ty);
+	      if (available_v (&state) < n)
+		{
+		  state.nsrn = N_V_ARG_REG;
+		  avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
+						 ty->size);
+		}
+	      else
+		{
+		  switch (get_homogeneous_type (ty))
+		    {
+		    case FFI_TYPE_FLOAT:
+		      {
+			/* Eeek! We need a pointer to the structure,
+			   however the homogeneous float elements are
+			   being passed in individual S registers,
+			   therefore the structure is not represented as
+			   a contiguous sequence of bytes in our saved
+			   register context. We need to fake up a copy
+			   of the structure laid out in memory
+			   correctly. The fake can be tossed once the
+			   closure function has returned hence alloca()
+			   is sufficient. */
+			unsigned j;
+			UINT32 *p = avalue[i] = alloca (ty->size);
+			for (j = 0; j < n; j++)
+			  memcpy (&p[j],
+				  allocate_to_s (context, &state),
+				  sizeof (*p));
+			break;
+		      }
+
+		    case FFI_TYPE_DOUBLE:
+		      {
+			/* As for floats: the homogeneous double
+			   elements are being passed in individual D
+			   registers, each in its own slot of our saved
+			   register context, so the structure is not
+			   contiguous there.  Fake up a copy laid out
+			   in memory correctly; alloca() is sufficient
+			   as the fake can be tossed once the closure
+			   function has returned.  */
+			unsigned j;
+			UINT64 *p = avalue[i] = alloca (ty->size);
+			for (j = 0; j < n; j++)
+			  memcpy (&p[j],
+				  allocate_to_d (context, &state),
+				  sizeof (*p));
+			break;
+		      }
+
+		    case FFI_TYPE_LONGDOUBLE:
+		      /* Each element fills one whole V register slot and
+			 the slots are adjacent, so the saved context
+			 already holds the structure; point straight at it.  */
+		      avalue[i] = get_v_addr (context, state.nsrn);
+		      state.nsrn += n;
+		      break;
+
+		    default:
+		      FFI_ASSERT (0);
+		      break;
+		    }
+		}
+	    }
+	  else if (ty->size > 16)
+	    {
+	      /* Replace Composite type of size greater than 16 with a
+		 pointer.  */
+	      memcpy (&avalue[i],
+		      allocate_to_register_or_stack (context, stack,
+						     &state, FFI_TYPE_POINTER),
+		      sizeof (avalue[i]));
+	    }
+	  else if (available_x (&state) >= (ty->size + 7) / 8)
+	    {
+	      avalue[i] = get_x_addr (context, state.ngrn);
+	      state.ngrn += (ty->size + 7) / 8;
+	    }
+	  else
+	    {
+	      state.ngrn = N_X_ARG_REG;
+
+	      avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
+					     ty->size);
+	    }
+	  break;
+
+	default:
+	  FFI_ASSERT (0);
+	  break;
+	}
+    }
+
+  /* Figure out where the return value will be passed, either in
+     registers or in a memory block allocated by the caller and passed
+     in x8.  */
+
+  if (is_register_candidate (cif->rtype))
+    {
+      /* Register candidates are *always* returned in registers. */
+
+      /* Allocate a scratchpad for the callee to scribble the result
+         into.  It is never smaller than an ffi_arg, since closure
+         functions may store a small integer result as a full ffi_arg.  */
+      rvalue = alloca (cif->rtype->size < sizeof (ffi_arg)
+		       ? sizeof (ffi_arg) : cif->rtype->size);
+      (closure->fun) (cif, rvalue, avalue, closure->user_data);
+
+      /* Copy the return value into the call context so that it is returned
+         as expected to our caller.  */
+      switch (cif->rtype->type)
+        {
+        case FFI_TYPE_VOID:
+          break;
+
+        case FFI_TYPE_UINT8:
+        case FFI_TYPE_UINT16:
+        case FFI_TYPE_UINT32:
+        case FFI_TYPE_POINTER:
+        case FFI_TYPE_UINT64:
+        case FFI_TYPE_SINT8:
+        case FFI_TYPE_SINT16:
+        case FFI_TYPE_INT:
+        case FFI_TYPE_SINT32:
+        case FFI_TYPE_SINT64:
+        case FFI_TYPE_FLOAT:
+        case FFI_TYPE_DOUBLE:
+        case FFI_TYPE_LONGDOUBLE:
+	  {
+	    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
+	    copy_basic_type (addr, rvalue, cif->rtype->type);
+            break;
+	  }
+        case FFI_TYPE_STRUCT:
+          if (is_hfa (cif->rtype))
+	    {
+	      unsigned j;
+	      unsigned short type = get_homogeneous_type (cif->rtype);
+	      unsigned elems = element_count (cif->rtype);
+	      for (j = 0; j < elems; j++)
+		{
+		  void *reg = get_basic_type_addr (type, context, j);
+		  copy_basic_type (reg, rvalue, type);
+		  rvalue = (unsigned char *) rvalue + get_basic_type_size (type);
+		}
+	    }
+          else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
+            {
+              /* Copy the struct itself only, never past the scratchpad.  */
+              memcpy (get_x_addr (context, 0), rvalue, cif->rtype->size);
+            }
+          else
+            {
+              FFI_ASSERT (0);
+            }
+          break;
+        default:
+          FFI_ASSERT (0);
+          break;
+        }
+    }
+  else
+    {
+      memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
+      (closure->fun) (cif, rvalue, avalue, closure->user_data);
+    }
+}
+
diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h
new file mode 100644
index 0000000..6f1a348
--- /dev/null
+++ b/libffi/src/aarch64/ffitarget.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#ifndef LIBFFI_TARGET_H
+#define LIBFFI_TARGET_H
+
+#ifndef LIBFFI_H
+#error "Please do not include ffitarget.h directly into your source.  Use ffi.h instead."
+#endif
+
+#ifndef LIBFFI_ASM	/* C-only declarations; sysv.S defines LIBFFI_ASM.  */
+typedef unsigned long ffi_arg;	/* copy_basic_type stores unsigned integers as this.  */
+typedef signed long ffi_sarg;	/* copy_basic_type stores signed integers as this.  */
+
+typedef enum ffi_abi
+  {
+    FFI_FIRST_ABI = 0,		/* Presumably a range sentinel, not a real ABI -- confirm.  */
+    FFI_SYSV,			/* The only ABI ffi_call and ffi_prep_closure_loc accept.  */
+    FFI_LAST_ABI,		/* Presumably a range sentinel, not a real ABI -- confirm.  */
+    FFI_DEFAULT_ABI = FFI_SYSV
+  } ffi_abi;
+#endif
+
+/* ---- Definitions for closures ----------------------------------------- */
+/* FFI_TRAMPOLINE_SIZE: 12 bytes of code plus the three 8-byte words of ffi.c.  */
+#define FFI_CLOSURES 1
+#define FFI_TRAMPOLINE_SIZE 36
+#define FFI_NATIVE_RAW_API 0
+
+/* ---- Internal ---- */
+
+/* Extra ffi_cif member; ffi.c keeps the AARCH64_FFI_WITH_V flag in it.  */
+#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
+
+#define AARCH64_FFI_WITH_V_BIT 0
+/* Slot counts of struct call_context in ffi.c, and its intended byte size.  */
+#define AARCH64_N_XREG 32
+#define AARCH64_N_VREG 32
+#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16)
+
+#endif
diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S
new file mode 100644
index 0000000..b8cd421
--- /dev/null
+++ b/libffi/src/aarch64/sysv.S
@@ -0,0 +1,307 @@
+/* Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+
+#define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#define cfi_restore(reg)		.cfi_restore reg
+#define cfi_def_cfa_register(reg)	.cfi_def_cfa_register reg
+
+        .text
+        .globl ffi_call_SYSV
+        .type ffi_call_SYSV, #function
+
+/* ffi_call_SYSV()
+
+   Create a stack frame, setup an argument context, call the callee
+   and extract the result.
+
+   The maximum required argument stack size is provided,
+   ffi_call_SYSV() allocates that stack space then calls the
+   prepare_fn to populate register context and stack.  The
+   argument passing registers are loaded from the register
+   context and the callee called, on return the argument passing
+   registers are saved back to the context.  Our caller will
+   extract the return value from the final state of the saved
+   register context.
+
+   Prototype:
+
+   extern unsigned
+   ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
+			       extended_cif *),
+                  struct call_context *context,
+                  extended_cif *,
+                  unsigned required_stack_size,
+                  void (*fn)(void));
+
+   Therefore on entry we have:
+
+   x0 prepare_fn
+   x1 &context
+   x2 &ecif
+   x3 bytes
+   x4 fn
+
+   This function uses the following stack frame layout:
+
+   ==
+                saved x30(lr)
+   x29(fp)->    saved x29(fp)
+                saved x24
+                saved x23
+                saved x22
+   sp'    ->    saved x21
+                ...
+   sp     ->    (constructed callee stack arguments)
+   ==
+
+   Voila! */
+
+#define ffi_call_SYSV_FS (8 * 4) /* bytes for the x21-x24 save slots */
+
+        .cfi_startproc
+ffi_call_SYSV:
+        stp     x29, x30, [sp, #-16]!
+	cfi_adjust_cfa_offset (16)
+        cfi_rel_offset (x29, 0)
+        cfi_rel_offset (x30, 8)
+
+        mov     x29, sp
+	cfi_def_cfa_register (x29)
+        sub     sp, sp, #ffi_call_SYSV_FS
+
+        stp     x21, x22, [sp, 0]
+        cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS)
+        cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS)
+
+        stp     x23, x24, [sp, 16]
+        cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS)
+        cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS)
+
+        mov     x21, x1 /* x21 = &context */
+        mov     x22, x2 /* x22 = &ecif */
+        mov     x24, x4 /* x24 = fn */
+
+        /* Allocate stack for the outgoing arguments.  Many will go in
+           registers, but we assume the worst case and reserve room for
+           ALL of them.  NOTE(review): presumably x3 is already a
+           multiple of 16 so sp stays aligned -- confirm in ffi.c.  */
+        sub     sp, sp, x3
+
+        /* unsigned (*prepare_fn) (struct call_context *context,
+				   unsigned char *stack, extended_cif *ecif);
+	 */
+        mov     x23, x0
+        mov     x0, x1
+        mov     x1, sp
+        /* x2 already in place */
+        blr     x23
+
+        /* Preserve the flags returned (x23 no longer holds prepare_fn).  */
+        mov     x23, x0
+
+        /* Figure out if we should touch the vector registers.  */
+        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+
+        /* Load the vector argument registers (v[] follows the 32 x slots).  */
+        ldp     q0, q1, [x21, #8*32 +  0]
+        ldp     q2, q3, [x21, #8*32 + 32]
+        ldp     q4, q5, [x21, #8*32 + 64]
+        ldp     q6, q7, [x21, #8*32 + 96]
+1:
+        /* Load the core argument passing registers.  */
+        ldp     x0, x1, [x21,  #0]
+        ldp     x2, x3, [x21, #16]
+        ldp     x4, x5, [x21, #32]
+        ldp     x6, x7, [x21, #48]
+
+        /* Don't forget x8 which may be holding the address of a return buffer.
+	 */
+        ldr     x8,     [x21, #8*8]
+
+        blr     x24
+
+        /* Save the core argument passing registers.  */
+        stp     x0, x1, [x21,  #0]
+        stp     x2, x3, [x21, #16]
+        stp     x4, x5, [x21, #32]
+        stp     x6, x7, [x21, #48]
+
+        /* Note nothing useful ever comes back in x8!  */
+
+        /* Figure out if we should touch the vector registers.  */
+        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+
+        /* Save the vector argument passing registers.  */
+        stp     q0, q1, [x21, #8*32 + 0]
+        stp     q2, q3, [x21, #8*32 + 32]
+        stp     q4, q5, [x21, #8*32 + 64]
+        stp     q6, q7, [x21, #8*32 + 96]
+1:
+        /* All done, unwind our stack frame.  */
+        ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
+        cfi_restore (x21)
+        cfi_restore (x22)
+
+        ldp     x23, x24, [x29,  # - ffi_call_SYSV_FS + 16]
+        cfi_restore (x23)
+        cfi_restore (x24)
+
+        mov     sp, x29
+	cfi_def_cfa_register (sp)
+
+        ldp     x29, x30, [sp], #16
+	cfi_adjust_cfa_offset (-16)
+        cfi_restore (x29)
+        cfi_restore (x30)
+
+        ret
+
+        .cfi_endproc
+        .size ffi_call_SYSV, .-ffi_call_SYSV
+
+#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE)
+
+/* ffi_closure_SYSV
+
+   Closure invocation glue. This is the low level code invoked directly by
+   the closure trampoline to setup and call a closure.
+
+   On entry x17 points to a struct trampoline_data, x16 has been clobbered
+   all other registers are preserved.
+
+   We allocate a call context and save the argument passing registers,
+   then invoked the generic C ffi_closure_SYSV_inner() function to do all
+   the real work, on return we load the result passing registers back from
+   the call context.
+
+   On entry
+
+   extern void
+   ffi_closure_SYSV (struct trampoline_data *);
+
+   struct trampoline_data
+   {
+        UINT64 *ffi_closure;
+        UINT64 flags;
+   };
+
+   This function uses the following stack frame layout:
+
+   ==
+                saved x30(lr)
+   x29(fp)->    saved x29(fp)
+                saved x22
+                saved x21
+                ...
+   sp     ->    call_context
+   ==
+
+   Voila!  */
+
+        .text
+        .globl ffi_closure_SYSV
+        .cfi_startproc
+ffi_closure_SYSV:
+        stp     x29, x30, [sp, #-16]!
+	cfi_adjust_cfa_offset (16)
+        cfi_rel_offset (x29, 0)
+        cfi_rel_offset (x30, 8)
+
+        mov     x29, sp
+
+        sub     sp, sp, #ffi_closure_SYSV_FS
+	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+        /* x21/x22 are saved 16 bytes below x29, i.e. FS - 16 above sp.  */
+        stp     x21, x22, [x29, #-16]
+        cfi_rel_offset (x21, ffi_closure_SYSV_FS - 16)
+        cfi_rel_offset (x22, ffi_closure_SYSV_FS - 8)
+
+        /* Load x21 with &call_context.  */
+        mov     x21, sp
+        /* Preserve our struct trampoline_data *  */
+        mov     x22, x17
+
+        /* Save the rest of the argument passing registers.  */
+        stp     x0, x1, [x21, #0]
+        stp     x2, x3, [x21, #16]
+        stp     x4, x5, [x21, #32]
+        stp     x6, x7, [x21, #48]
+        /* Don't forget we may have been given a result scratch pad address.
+	 */
+        str     x8,     [x21, #64]
+
+        /* Figure out if we should touch the vector registers.  */
+        ldr     x0, [x22, #8]
+        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+
+        /* Save the argument passing vector registers.  */
+        stp     q0, q1, [x21, #8*32 + 0]
+        stp     q2, q3, [x21, #8*32 + 32]
+        stp     q4, q5, [x21, #8*32 + 64]
+        stp     q6, q7, [x21, #8*32 + 96]
+1:
+        /* Load &ffi_closure..  */
+        ldr     x0, [x22, #0]
+        mov     x1, x21
+        /* Compute the location of the stack at the point that the
+           trampoline was called.  */
+        add     x2, x29, #16
+
+        bl      ffi_closure_SYSV_inner
+
+        /* Figure out if we should touch the vector registers.  */
+        ldr     x0, [x22, #8]
+        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+
+        /* Load the result passing vector registers.  */
+        ldp     q0, q1, [x21, #8*32 + 0]
+        ldp     q2, q3, [x21, #8*32 + 32]
+        ldp     q4, q5, [x21, #8*32 + 64]
+        ldp     q6, q7, [x21, #8*32 + 96]
+1:
+        /* Load the result passing core registers.  */
+        ldp     x0, x1, [x21,  #0]
+        ldp     x2, x3, [x21, #16]
+        ldp     x4, x5, [x21, #32]
+        ldp     x6, x7, [x21, #48]
+        /* Note nothing useful is returned in x8.  */
+
+        /* We are done, unwind our frame.  */
+        ldp     x21, x22, [x29,  #-16]
+        cfi_restore (x21)
+        cfi_restore (x22)
+
+        mov     sp, x29
+	cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS)
+
+        ldp     x29, x30, [sp], #16
+	cfi_adjust_cfa_offset (-16)
+        cfi_restore (x29)
+        cfi_restore (x30)
+
+        ret
+        .cfi_endproc
+        .size ffi_closure_SYSV, .-ffi_closure_SYSV
diff --git a/libffi/src/arm/ffi.c b/libffi/src/arm/ffi.c
index 1f8597d..3ccceb9 100644
--- a/libffi/src/arm/ffi.c
+++ b/libffi/src/arm/ffi.c
@@ -251,8 +251,10 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
       break;
 
     case FFI_VFP:
+#ifdef __ARM_EABI__
       ffi_call_VFP (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
       break;
+#endif
 
     default:
       FFI_ASSERT(0);
@@ -609,8 +611,10 @@ ffi_prep_closure_loc (ffi_closure* closure,
 
   if (cif->abi == FFI_SYSV)
     closure_func = &ffi_closure_SYSV;
+#ifdef __ARM_EABI__
   else if (cif->abi == FFI_VFP)
     closure_func = &ffi_closure_VFP;
+#endif
   else
     return FFI_BAD_ABI;
     
diff --git a/libffi/src/arm/sysv.S b/libffi/src/arm/sysv.S
index 60e2ae3..fb38cd6 100644
--- a/libffi/src/arm/sysv.S
+++ b/libffi/src/arm/sysv.S
@@ -41,7 +41,7 @@
 #define CNAME(x) x
 #endif
 #ifdef __APPLE__
-#define ENTRY(x) .globl CNAME(x); CNAME(x):
+#define ENTRY(x) .globl _##x; _##x:
 #else
 #define ENTRY(x) .globl CNAME(x); .type CNAME(x),%function; CNAME(x):
 #endif /* __APPLE__ */
@@ -187,7 +187,7 @@ ARM_FUNC_START ffi_call_SYSV
 	@     r1 already set
 
 	@ Call ffi_prep_args(stack, &ecif)
-	bl	ffi_prep_args
+	bl	CNAME(ffi_prep_args)
 
 	@ move first 4 parameters in registers
 	ldmia	sp, {r0-r3}
@@ -334,7 +334,9 @@ ARM_FUNC_START ffi_closure_SYSV
 
 
 /* Below are VFP hard-float ABI call and closure implementations.
-   Add VFP FPU directive here. */
+   Add VFP FPU directive here. This is only compiled into the library
+   under EABI.  */
+#ifdef __ARM_EABI__
 	.fpu	vfp
 
 	@ r0:   fn
@@ -362,7 +364,7 @@ ARM_FUNC_START ffi_call_VFP
 	sub	r2, fp, #64   @ VFP scratch space
 
 	@ Call ffi_prep_args(stack, &ecif, vfp_space)
-	bl	ffi_prep_args
+	bl	CNAME(ffi_prep_args)
 
 	@ Load VFP register args if needed
 	cmp	r0, #0
@@ -444,7 +446,7 @@ ARM_FUNC_START ffi_closure_VFP
 	sub	sp, sp, #72
 	str	sp, [sp, #64]
 	add	r1, sp, #64
-	bl	ffi_closure_SYSV_inner
+	bl	CNAME(ffi_closure_SYSV_inner)
 
 	cmp	r0, #FFI_TYPE_INT
 	beq	.Lretint_vfp
@@ -491,6 +493,7 @@ ARM_FUNC_START ffi_closure_VFP
 .ffi_closure_VFP_end:
 	UNWIND .fnend
         .size    CNAME(ffi_closure_VFP),.ffi_closure_VFP_end-CNAME(ffi_closure_VFP)
+#endif
 
 ENTRY(ffi_arm_trampoline)
 	stmfd sp!, {r0-r3}
diff --git a/libffi/src/arm/trampoline.S b/libffi/src/arm/trampoline.S
index 7b47429..935e8de 100644
--- a/libffi/src/arm/trampoline.S
+++ b/libffi/src/arm/trampoline.S
@@ -1,5 +1,5 @@
 # GENERATED CODE - DO NOT EDIT
-# This file was generated by ./gentramp.sh
+# This file was generated by src/arm/gentramp.sh
 
 #  Copyright (c) 2010, Plausible Labs Cooperative, Inc.
 #  
diff --git a/libffi/src/bfin/ffi.c b/libffi/src/bfin/ffi.c
new file mode 100644
index 0000000..0beccc1
--- /dev/null
+++ b/libffi/src/bfin/ffi.c
@@ -0,0 +1,195 @@
+/* -----------------------------------------------------------------------
+   ffi.c - Copyright (c) 2012  Alexandre K. I. de Mendonca <alexandre.keunecke@gmail.com>
+
+   Blackfin Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+#include <ffi.h>
+#include <ffi_common.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Maximum number of GPRs available for argument passing.  */
+#define MAX_GPRARGS 3
+
+/*
+ * Return types
+ */
+#define FFIBFIN_RET_VOID 0
+#define FFIBFIN_RET_BYTE 1
+#define FFIBFIN_RET_HALFWORD 2
+#define FFIBFIN_RET_INT64 3
+#define FFIBFIN_RET_INT32 4
+
+/*====================================================================*/
+/*                          PROTOTYPE                                 */
+/*====================================================================*/
+void ffi_prep_args(unsigned char *, extended_cif *);
+
+/*====================================================================*/
+/*                          Externals                                 */
+/*                          (Assembly)                                */
+/*====================================================================*/
+
+extern void ffi_call_SYSV(unsigned, extended_cif *, void(*)(unsigned char *, extended_cif *), unsigned, void *, void(*fn)(void));
+
+/*====================================================================*/
+/*                          Implementation                            */
+/*                                                            */
+/*====================================================================*/
+
+
+/*
+ * Store the FFIBFIN_RET_* code for cif->rtype in cif->flags; returns FFI_OK.
+ */
+
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+{
+   /* --------------------------------------*
+    *   Return handling                *
+    * --------------------------------------*/
+   switch (cif->rtype->type) {
+      case FFI_TYPE_VOID:
+         cif->flags = FFIBFIN_RET_VOID;
+         break;
+      case FFI_TYPE_UINT16:
+      case FFI_TYPE_SINT16:
+         cif->flags = FFIBFIN_RET_HALFWORD;
+         break;
+      case FFI_TYPE_UINT8:
+         cif->flags = FFIBFIN_RET_BYTE;
+         break;
+      case FFI_TYPE_INT:
+      case FFI_TYPE_UINT32:
+      case FFI_TYPE_SINT32:
+      case FFI_TYPE_FLOAT:
+      case FFI_TYPE_POINTER:
+      case FFI_TYPE_SINT8: /* stored as a full word; only UINT8 uses RET_BYTE */
+         cif->flags = FFIBFIN_RET_INT32;
+         break;
+      case FFI_TYPE_SINT64:
+      case FFI_TYPE_UINT64:
+      case FFI_TYPE_DOUBLE:
+          cif->flags = FFIBFIN_RET_INT64;
+          break;
+      case FFI_TYPE_STRUCT:
+         if (cif->rtype->size <= 4){
+        	 cif->flags = FFIBFIN_RET_INT32;
+         }else if (cif->rtype->size == 8){
+        	 cif->flags = FFIBFIN_RET_INT64;
+         }else{ /* NOTE(review): also taken for sizes 5..7; sysv.S mentions only >8 bytes -- confirm */
+        	 //any other size: it will return via a hidden pointer in P0
+        	 cif->flags = FFIBFIN_RET_VOID;
+         }
+         break;
+      default:
+         FFI_ASSERT(0);
+         break;
+   }
+   return FFI_OK;
+}
+
+/*
+ * Bundle the call into an extended_cif and run it via ffi_call_SYSV (sysv.S)
+ * cif = the call interface (supplies abi, bytes and the return code in flags)
+ * fn = the function to be called
+ * rvalue = where the return value is stored
+ * avalue = the argument values
+ */
+void ffi_call(ffi_cif *cif, void(*fn)(void), void *rvalue, void **avalue)
+{
+   int ret_type = cif->flags;
+   extended_cif ecif;
+   ecif.cif = cif;
+   ecif.avalue = avalue;
+   ecif.rvalue = rvalue;
+
+   switch (cif->abi) {
+      case FFI_SYSV:
+         ffi_call_SYSV(cif->bytes, &ecif, ffi_prep_args, ret_type, ecif.rvalue, fn);
+         break;
+      default: /* FFI_SYSV is the only ABI in this target's ffi_abi enum */
+         FFI_ASSERT(0);
+         break;
+   }
+}
+
+
+/*
+* Copy the arguments from ecif->avalue into the outgoing area at stack.
+* Called by the assembly routine in sysv.S, which then makes the actual
+* call.  Arguments smaller than an int each take a full int-sized slot.
+*/
+void ffi_prep_args(unsigned char *stack, extended_cif *ecif)
+{
+   register unsigned int i = 0;
+   void **p_argv;
+   unsigned char *argp;
+   ffi_type **p_arg;
+   argp = stack;
+   p_argv = ecif->avalue;
+   for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
+        (i != 0);
+        i--, p_arg++) {
+      size_t z;
+      z = (*p_arg)->size;
+      if (z < sizeof(int)) {
+         z = sizeof(int); /* slot size; the value itself is widened below */
+         switch ((*p_arg)->type) {
+            case FFI_TYPE_SINT8: {
+                  signed char v = *(SINT8 *)(* p_argv);
+                  signed int t = v;
+                  *(signed int *) argp = t;
+               }
+               break;
+            case FFI_TYPE_UINT8: {
+                  unsigned char v = *(UINT8 *)(* p_argv);
+                  unsigned int t = v;
+                  *(unsigned int *) argp = t;
+               }
+               break;
+            case FFI_TYPE_SINT16:
+               *(signed int *) argp = (signed int) * (SINT16 *)(* p_argv);
+               break;
+            case FFI_TYPE_UINT16:
+               *(unsigned int *) argp = (unsigned int) * (UINT16 *)(* p_argv);
+               break;
+            case FFI_TYPE_STRUCT: /* copies only the struct's own bytes into the slot */
+               memcpy(argp, *p_argv, (*p_arg)->size);
+               break;
+            default:
+               FFI_ASSERT(0);
+               break;
+         }
+      } else if (z == sizeof(int)) {
+         *(unsigned int *) argp = (unsigned int) * (UINT32 *)(* p_argv);
+      } else {
+         memcpy(argp, *p_argv, z);
+      }
+      p_argv++;
+      argp += z; /* slots are laid out back to back, no extra padding */
+   }
+}
+
+
+
diff --git a/libffi/src/bfin/ffitarget.h b/libffi/src/bfin/ffitarget.h
new file mode 100644
index 0000000..2175c01
--- /dev/null
+++ b/libffi/src/bfin/ffitarget.h
@@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------
+   ffitarget.h - Copyright (c) 2012  Alexandre K. I. de Mendonca <alexandre.keunecke@gmail.com>
+
+   Blackfin Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#ifndef LIBFFI_TARGET_H
+#define LIBFFI_TARGET_H
+/* The C declarations are skipped when sysv.S defines LIBFFI_ASM.  */
+#ifndef LIBFFI_ASM
+typedef unsigned long          ffi_arg;
+typedef signed 	 long          ffi_sarg;
+
+typedef enum ffi_abi {
+  FFI_FIRST_ABI = 0,
+  FFI_SYSV,
+  FFI_LAST_ABI,
+  FFI_DEFAULT_ABI = FFI_SYSV
+} ffi_abi;
+#endif /* !LIBFFI_ASM */
+
+#endif /* LIBFFI_TARGET_H */
+
diff --git a/libffi/src/bfin/sysv.S b/libffi/src/bfin/sysv.S
new file mode 100644
index 0000000..ae7a152
--- /dev/null
+++ b/libffi/src/bfin/sysv.S
@@ -0,0 +1,177 @@
+/* -----------------------------------------------------------------------
+   sysv.S - Copyright (c) 2012  Alexandre K. I. de Mendonca <alexandre.keunecke@gmail.com>
+
+   Blackfin Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+
+.text
+.align 4
+
+	/*
+	 There is a "feature" in the bfin toolchain: it puts a _ before function names,
+	 which is why the function here is called _ffi_call_SYSV and not ffi_call_SYSV
+	 */
+	.global _ffi_call_SYSV;
+	.type _ffi_call_SYSV, STT_FUNC;
+	.func ffi_call_SYSV
+
+	/*
+	cif->bytes  	= R0	(fp+8)
+	&ecif			= R1	(fp+12)
+	ffi_prep_args	= R2	(fp+16)
+	ret_type		= stack (fp+20)
+	ecif.rvalue		= stack (fp+24)
+	fn				= stack	(fp+28)
+					  got	(fp+32)
+    There is room for improvement here (we can use temporary registers
+        instead of saving the values in the memory)
+	REGS:
+		P5 => Stack pointer (function arguments)
+		R5 => cif->bytes
+		R4 => ret->type
+
+		FP-20 = P3
+		FP-16 = SP (parameters area)
+		FP-12 = SP (temp)
+		FP-08 = function return part 1 [R0]
+		FP-04 = function return part 2 [R1]
+	*/
+
+_ffi_call_SYSV:
+.prologue:
+	LINK 20;
+	[FP-20] = P3; //saved here, reloaded in the epilogue
+	[FP+8] = R0; //cif->bytes
+	[FP+12] = R1; //&ecif
+	[FP+16] = R2; //ffi_prep_args
+
+.allocate_stack:
+	//allocate cif->bytes on the stack
+	R1 = [FP+8];
+	R0 = SP;
+	R0 = R0 - R1;
+	R1 = 4;
+	R0 = R0 - R1; //plus 4 more bytes
+	[FP-12] = SP; //SP to return to after the user call
+	SP = R0;
+	[FP-16] = SP;
+
+.call_prep_args:
+	//get the addr of prep_args
+	P0 = [P3 + _ffi_prep_args@FUNCDESC_GOT17M4];
+	P1 = [P0];
+	P3 = [P0+4];
+	R0 = [FP-16];//SP (parameter area)
+	R1 = [FP+12];//ecif
+	call (P1);
+
+.call_user_function:
+	//adjust SP so that the user function can access the parameters on the stack
+	SP = [FP-16]; //point to function parameters
+	R0 = [SP]; //first three parameter words are also loaded into R0-R2
+	R1 = [SP+4];
+	R2 = [SP+8];
+	//load user function address
+	P0 = FP;
+	P0 +=28;
+	P1 = [P0]; //fn argument (FP+28)
+	P1 = [P1]; //first word behind fn is the address that gets called
+	P3 = [P0+4]; //FP+32, the "got" slot in the layout comment above
+	/*
+		For functions returning aggregate values (struct) occupying more than 8 bytes,
+		the caller allocates the return value object on the stack and the address
+		of this object is passed to the callee as a hidden argument in register P0.
+	*/
+	P0 = [FP+24];
+
+	call (P1);
+	SP = [FP-12];
+.compute_return:
+	P2 = [FP-20]; //P3 value saved in the prologue
+	[FP-8] = R0; //park the returned R0:R1 pair
+	[FP-4] = R1;
+
+	R0 = [FP+20]; //ret_type
+	R1 = R0 << 2; //byte index into .rettable
+
+	R0 = [P2+.rettable@GOT17M4];
+	R0 = R1 + R0;
+	P2 = R0;
+	R1 = [P2]; //table entry: handler offset from .rettable
+
+	P2 = [FP+-20];
+	R0 = [P2+.rettable@GOT17M4];
+	R0 = R1 + R0; //handler address
+	P2 = R0;
+	R0 = [FP-8]; //restore the returned R0:R1 pair
+	R1 = [FP-4];
+	jump (P2);
+
+/*
+#define FFIBFIN_RET_VOID 0
+#define FFIBFIN_RET_BYTE 1
+#define FFIBFIN_RET_HALFWORD 2
+#define FFIBFIN_RET_INT64 3
+#define FFIBFIN_RET_INT32 4
+*/
+.align 4
+.align 4
+.rettable:
+	.dd .epilogue - .rettable
+	.dd	.rbyte - .rettable;
+	.dd	.rhalfword - .rettable;
+	.dd	.rint64 - .rettable;
+	.dd	.rint32 - .rettable;
+
+.rbyte:
+	P0 = [FP+24];
+	R0 = R0.B (Z);
+	[P0] = R0;
+	JUMP .epilogue
+.rhalfword:
+	P0 = [FP+24];
+	R0 = R0.L;
+	[P0] = R0;
+	JUMP .epilogue
+.rint64:
+	P0 = [FP+24];// &rvalue
+	[P0] = R0;
+	[P0+4] = R1;
+	JUMP .epilogue
+.rint32:
+	P0 = [FP+24];
+	[P0] = R0;
+.epilogue:
+	R0 = [FP+8]; //reload the values spilled in the prologue
+	R1 = [FP+12];
+	R2 = [FP+16];
+	P3 = [FP-20];
+	UNLINK;
+	RTS;
+
+.size _ffi_call_SYSV,.-_ffi_call_SYSV;
+.endfunc
diff --git a/libffi/src/closures.c b/libffi/src/closures.c
index 1b37827..fecbc4a 100644
--- a/libffi/src/closures.c
+++ b/libffi/src/closures.c
@@ -172,6 +172,27 @@ selinux_enabled_check (void)
 
 #endif /* !FFI_MMAP_EXEC_SELINUX */
 
+/* On PaX enabled kernels that have MPROTECT enabled we can't use PROT_EXEC. */
+#ifdef FFI_MMAP_EXEC_EMUTRAMP_PAX
+#include <stdlib.h>
+/* Cached result of emutramp_enabled_check (); -1 means not yet checked.  */
+static int emutramp_enabled = -1;
+/* Return 1 unless FFI_DISABLE_EMUTRAMP is set in the environment.  */
+static int
+emutramp_enabled_check (void)
+{
+  if (getenv ("FFI_DISABLE_EMUTRAMP") == NULL)
+    return 1;
+  else
+    return 0;
+}
+/* Run the check on first use and cache the answer in emutramp_enabled.  */
+#define is_emutramp_enabled() (emutramp_enabled >= 0 ? emutramp_enabled \
+                               : (emutramp_enabled = emutramp_enabled_check ()))
+#else
+#define is_emutramp_enabled() 0
+#endif /* FFI_MMAP_EXEC_EMUTRAMP_PAX */
+
 #elif defined (__CYGWIN__) || defined(__INTERIX)
 
 #include <sys/mman.h>
@@ -458,6 +479,12 @@ dlmmap (void *start, size_t length, int prot,
   printf ("mapping in %zi\n", length);
 #endif
 
+  if (execfd == -1 && is_emutramp_enabled ())
+    {
+      ptr = mmap (start, length, prot & ~PROT_EXEC, flags, fd, offset);
+      return ptr;
+    }
+
   if (execfd == -1 && !is_selinux_enabled ())
     {
       ptr = mmap (start, length, prot | PROT_EXEC, flags, fd, offset);
diff --git a/libffi/src/m68k/ffi.c b/libffi/src/m68k/ffi.c
index d95c72b..37a0784 100644
--- a/libffi/src/m68k/ffi.c
+++ b/libffi/src/m68k/ffi.c
@@ -1,7 +1,7 @@
 /* -----------------------------------------------------------------------
    ffi.c
-   
-   m68k Foreign Function Interface 
+
+   m68k Foreign Function Interface
    ----------------------------------------------------------------------- */
 
 #include <ffi.h>
@@ -13,8 +13,13 @@
 void rtems_cache_flush_multiple_data_lines( const void *, size_t );
 #else
 #include <sys/syscall.h>
+#ifdef __MINT__
+#include <mint/mintbind.h>
+#include <mint/ssystem.h>
+#else
 #include <asm/cachectl.h>
 #endif
+#endif
 
 void ffi_call_SYSV (extended_cif *,
 		    unsigned, unsigned,
@@ -39,8 +44,12 @@ ffi_prep_args (void *stack, extended_cif *ecif)
 
   argp = stack;
 
-  if (ecif->cif->rtype->type == FFI_TYPE_STRUCT
-      && !ecif->cif->flags)
+  if (
+#ifdef __MINT__
+      (ecif->cif->rtype->type == FFI_TYPE_LONGDOUBLE) ||
+#endif
+      (((ecif->cif->rtype->type == FFI_TYPE_STRUCT)
+        && !ecif->cif->flags)))
     struct_value_ptr = ecif->rvalue;
   else
     struct_value_ptr = NULL;
@@ -51,12 +60,12 @@ ffi_prep_args (void *stack, extended_cif *ecif)
        i != 0;
        i--, p_arg++)
     {
-      size_t z;
+      size_t z = (*p_arg)->size;
+      int type = (*p_arg)->type;
 
-      z = (*p_arg)->size;
       if (z < sizeof (int))
 	{
-	  switch ((*p_arg)->type)
+	  switch (type)
 	    {
 	    case FFI_TYPE_SINT8:
 	      *(signed int *) argp = (signed int) *(SINT8 *) *p_argv;
@@ -75,7 +84,14 @@ ffi_prep_args (void *stack, extended_cif *ecif)
 	      break;
 
 	    case FFI_TYPE_STRUCT:
+#ifdef __MINT__
+	      if (z == 1 || z == 2)
+		memcpy (argp + 2, *p_argv, z);
+              else
+		memcpy (argp, *p_argv, z);
+#else
 	      memcpy (argp + sizeof (int) - z, *p_argv, z);
+#endif
 	      break;
 
 	    default:
@@ -120,17 +136,34 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       break;
 
     case FFI_TYPE_STRUCT:
+      if (cif->rtype->elements[0]->type == FFI_TYPE_STRUCT &&
+          cif->rtype->elements[1])
+        {
+          cif->flags = 0;
+          break;
+        }
+
       switch (cif->rtype->size)
 	{
 	case 1:
+#ifdef __MINT__
+	  cif->flags = CIF_FLAGS_STRUCT2;
+#else
 	  cif->flags = CIF_FLAGS_STRUCT1;
+#endif
 	  break;
 	case 2:
 	  cif->flags = CIF_FLAGS_STRUCT2;
 	  break;
+#ifdef __MINT__
+	case 3:
+#endif
 	case 4:
 	  cif->flags = CIF_FLAGS_INT;
 	  break;
+#ifdef __MINT__
+	case 7:
+#endif
 	case 8:
 	  cif->flags = CIF_FLAGS_DINT;
 	  break;
@@ -150,7 +183,11 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
 #if (FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE)
     case FFI_TYPE_LONGDOUBLE:
+#ifdef __MINT__
+      cif->flags = 0;
+#else
       cif->flags = CIF_FLAGS_LDOUBLE;
+#endif
       break;
 #endif
 
@@ -218,6 +255,26 @@ ffi_prep_incoming_args_SYSV (char *stack, void **avalue, ffi_cif *cif)
       size_t z;
 
       z = (*p_arg)->size;
+#ifdef __MINT__
+      if (cif->flags &&
+          cif->rtype->type == FFI_TYPE_STRUCT &&
+          (z == 1 || z == 2))
+ 	{
+	  *p_argv = (void *) (argp + 2);
+
+	  z = 4;
+	}
+      else
+      if (cif->flags &&
+          cif->rtype->type == FFI_TYPE_STRUCT &&
+          (z == 3 || z == 4))
+ 	{
+	  *p_argv = (void *) (argp);
+
+	  z = 4;
+	}
+      else
+#endif
       if (z <= 4)
 	{
 	  *p_argv = (void *) (argp + 4 - z);
@@ -267,14 +324,21 @@ ffi_prep_closure_loc (ffi_closure* closure,
   *(unsigned short *)closure->tramp = 0x207c;
   *(void **)(closure->tramp + 2) = codeloc;
   *(unsigned short *)(closure->tramp + 6) = 0x4ef9;
-  if (cif->rtype->type == FFI_TYPE_STRUCT
-      && !cif->flags)
+
+  if (
+#ifdef __MINT__
+      (cif->rtype->type == FFI_TYPE_LONGDOUBLE) ||
+#endif
+      (((cif->rtype->type == FFI_TYPE_STRUCT)
+         && !cif->flags)))
     *(void **)(closure->tramp + 8) = ffi_closure_struct_SYSV;
   else
     *(void **)(closure->tramp + 8) = ffi_closure_SYSV;
 
 #ifdef __rtems__
   rtems_cache_flush_multiple_data_lines( codeloc, FFI_TRAMPOLINE_SIZE );
+#elif defined(__MINT__)
+  Ssystem(S_FLUSHCACHE, codeloc, FFI_TRAMPOLINE_SIZE);
 #else
   syscall(SYS_cacheflush, codeloc, FLUSH_SCOPE_LINE,
 	  FLUSH_CACHE_BOTH, FFI_TRAMPOLINE_SIZE);
@@ -286,4 +350,3 @@ ffi_prep_closure_loc (ffi_closure* closure,
 
   return FFI_OK;
 }
-
diff --git a/libffi/src/m68k/sysv.S b/libffi/src/m68k/sysv.S
index dfdd864..f6f4ef9 100644
--- a/libffi/src/m68k/sysv.S
+++ b/libffi/src/m68k/sysv.S
@@ -1,6 +1,7 @@
 /* -----------------------------------------------------------------------
 	
-   sysv.S - Copyright (c) 1998, 2012 Andreas Schwab
+   sysv.S - Copyright (c) 2012 Alan Hourihane
+	    Copyright (c) 1998, 2012 Andreas Schwab
 	    Copyright (c) 2008 Red Hat, Inc. 
    
    m68k Foreign Function Interface 
@@ -42,13 +43,19 @@
 #define CFI_ENDPROC()
 #endif
 
+#ifdef __MINT__
+#define CALLFUNC(funcname) _ ## funcname
+#else
+#define CALLFUNC(funcname) funcname
+#endif
+
 	.text
 
-	.globl	ffi_call_SYSV
-	.type	ffi_call_SYSV,@function
+	.globl	CALLFUNC(ffi_call_SYSV)
+	.type	CALLFUNC(ffi_call_SYSV),@function
 	.align	4
 
-ffi_call_SYSV:
+CALLFUNC(ffi_call_SYSV):
 	CFI_STARTPROC()
 	link	%fp,#0
 	CFI_OFFSET(14,-8)
@@ -63,14 +70,18 @@ ffi_call_SYSV:
 	move.l	8(%fp),-(%sp)
 	pea	4(%sp)
 #if !defined __PIC__
-	jsr	ffi_prep_args
+	jsr	CALLFUNC(ffi_prep_args)
 #else
-	bsr.l	ffi_prep_args@PLTPC
+	bsr.l	CALLFUNC(ffi_prep_args@PLTPC)
 #endif
 	addq.l	#8,%sp	
 
 	| Pass pointer to struct value, if any
+#ifdef __MINT__
+	move.l	%d0,%a1
+#else
 	move.l	%a0,%a1
+#endif
 
 	| Call the function
 	move.l	24(%fp),%a0
@@ -142,7 +153,11 @@ retlongdouble:
 retpointer:
 	btst	#5,%d2
 	jbeq	retstruct1
+#ifdef __MINT__
+	move.l	%d0,(%a1)
+#else
 	move.l	%a0,(%a1)
+#endif
 	jbra	epilogue
 
 retstruct1:
@@ -162,13 +177,13 @@ epilogue:
 	unlk	%fp
 	rts
 	CFI_ENDPROC()
-	.size	ffi_call_SYSV,.-ffi_call_SYSV
+	.size	CALLFUNC(ffi_call_SYSV),.-CALLFUNC(ffi_call_SYSV)
 
-	.globl	ffi_closure_SYSV
-	.type	ffi_closure_SYSV, @function
+	.globl	CALLFUNC(ffi_closure_SYSV)
+	.type	CALLFUNC(ffi_closure_SYSV), @function
 	.align	4
 
-ffi_closure_SYSV:
+CALLFUNC(ffi_closure_SYSV):
 	CFI_STARTPROC()
 	link	%fp,#-12
 	CFI_OFFSET(14,-8)
@@ -178,9 +193,9 @@ ffi_closure_SYSV:
 	pea	-12(%fp)
 	move.l	%a0,-(%sp)
 #if !defined __PIC__
-	jsr	ffi_closure_SYSV_inner
+	jsr	CALLFUNC(ffi_closure_SYSV_inner)
 #else
-	bsr.l	ffi_closure_SYSV_inner@PLTPC
+	bsr.l	CALLFUNC(ffi_closure_SYSV_inner@PLTPC)
 #endif
 
 	lsr.l	#1,%d0
@@ -240,13 +255,13 @@ ffi_closure_SYSV:
 	jra	.Lcls_epilogue
 	CFI_ENDPROC()
 
-	.size	ffi_closure_SYSV,.-ffi_closure_SYSV
+	.size	CALLFUNC(ffi_closure_SYSV),.-CALLFUNC(ffi_closure_SYSV)
 
-	.globl	ffi_closure_struct_SYSV
-	.type	ffi_closure_struct_SYSV, @function
+	.globl	CALLFUNC(ffi_closure_struct_SYSV)
+	.type	CALLFUNC(ffi_closure_struct_SYSV), @function
 	.align	4
 
-ffi_closure_struct_SYSV:
+CALLFUNC(ffi_closure_struct_SYSV):
 	CFI_STARTPROC()
 	link	%fp,#0
 	CFI_OFFSET(14,-8)
@@ -256,14 +271,14 @@ ffi_closure_struct_SYSV:
 	move.l	%a1,-(%sp)
 	move.l	%a0,-(%sp)
 #if !defined __PIC__
-	jsr	ffi_closure_SYSV_inner
+	jsr	CALLFUNC(ffi_closure_SYSV_inner)
 #else
-	bsr.l	ffi_closure_SYSV_inner@PLTPC
+	bsr.l	CALLFUNC(ffi_closure_SYSV_inner@PLTPC)
 #endif
 	unlk	%fp
 	rts
 	CFI_ENDPROC()
-	.size	ffi_closure_struct_SYSV,.-ffi_closure_struct_SYSV
+	.size	CALLFUNC(ffi_closure_struct_SYSV),.-CALLFUNC(ffi_closure_struct_SYSV)
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
diff --git a/libffi/src/powerpc/ffi_darwin.c b/libffi/src/powerpc/ffi_darwin.c
index ee03dab..dd897f4 100644
--- a/libffi/src/powerpc/ffi_darwin.c
+++ b/libffi/src/powerpc/ffi_darwin.c
@@ -1065,10 +1065,10 @@ ffi_prep_closure_loc (ffi_closure* closure,
       closure->cif = cif;
       closure->fun = fun;
       closure->user_data = user_data;
+      break;
 
     default:
-
-      FFI_ASSERT(0);
+      return FFI_BAD_ABI;
       break;
     }
   return FFI_OK;
@@ -1235,7 +1235,7 @@ ffi_closure_helper_DARWIN (ffi_closure *closure, void *rvalue,
 	  if (arg_types[i]->elements[0]->type == FFI_TYPE_DOUBLE)
 	    size_al = ALIGN(arg_types[i]->size, 8);
 #  if defined(POWERPC64)
-	  FFI_ASSERT (cif->abi != FFI_DARWIN)
+	  FFI_ASSERT (cif->abi != FFI_DARWIN);
 	  avalue[i] = pgr;
 	  pgr += (size_al + 7) / 8;
 #  else
diff --git a/libffi/src/prep_cif.c b/libffi/src/prep_cif.c
index eb68341..5d1924b 100644
--- a/libffi/src/prep_cif.c
+++ b/libffi/src/prep_cif.c
@@ -140,6 +140,9 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
 #ifdef SPARC
       && (cif->abi != FFI_V9 || cif->rtype->size > 32)
 #endif
+#ifdef TILE
+      && (cif->rtype->size > 10 * FFI_SIZEOF_ARG)
+#endif
      )
     bytes = STACK_ARG_SIZE(sizeof(void*));
 #endif
@@ -169,6 +172,16 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
 	  if (((*ptr)->alignment - 1) & bytes)
 	    bytes = ALIGN(bytes, (*ptr)->alignment);
 
+#ifdef TILE
+	  if (bytes < 10 * FFI_SIZEOF_ARG &&
+	      bytes + STACK_ARG_SIZE((*ptr)->size) > 10 * FFI_SIZEOF_ARG)
+	    {
+	      /* An argument is never split between the 10 parameter
+		 registers and the stack.  */
+	      bytes = 10 * FFI_SIZEOF_ARG;
+	    }
+#endif
+
 	  bytes += STACK_ARG_SIZE((*ptr)->size);
 	}
 #endif
diff --git a/libffi/src/tile/ffi.c b/libffi/src/tile/ffi.c
new file mode 100644
index 0000000..3a94469
--- /dev/null
+++ b/libffi/src/tile/ffi.c
@@ -0,0 +1,355 @@
+/* -----------------------------------------------------------------------
+   ffi.c - Copyright (c) 2012 Tilera Corp.
+
+   TILE Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <arch/abi.h>
+#include <arch/icache.h>
+#include <arch/opcode.h>
+
+
+/* The first 10 registers are used to pass arguments and return values. */
+#define NUM_ARG_REGS 10
+
+/* Performs a raw function call with the given NUM_ARG_REGS register arguments
+   and the specified additional stack arguments (if any). */
+extern void ffi_call_tile(ffi_sarg reg_args[NUM_ARG_REGS],
+                          const ffi_sarg *stack_args,
+                          size_t stack_args_bytes,
+                          void (*fnaddr)(void))
+  FFI_HIDDEN;
+
+/* This handles the raw call from the closure stub, cleaning up the
+   parameters and delegating to ffi_closure_tile_inner. */
+extern void ffi_closure_tile(void) FFI_HIDDEN;
+
+/* Finish CIF setup: size the argument area and choose the return mode.  */
+ffi_status
+ffi_prep_cif_machdep(ffi_cif *cif)
+{
+  /* We always allocate room for all registers. Even if we don't
+     use them as parameters, they get returned in the same array
+     as struct return values so we need to make room. */
+  if (cif->bytes < NUM_ARG_REGS * FFI_SIZEOF_ARG)
+    cif->bytes = NUM_ARG_REGS * FFI_SIZEOF_ARG;
+
+  if (cif->rtype->size > NUM_ARG_REGS * FFI_SIZEOF_ARG)
+    cif->flags = FFI_TYPE_STRUCT;  /* Too big for r0-r9: hidden pointer.  */
+  else
+    cif->flags = FFI_TYPE_INT;     /* Comes back in the register block.  */
+
+  /* This function has no failure path.  */
+  return FFI_OK;
+}
+
+/* Store IN (of TYPE) at OUT; return the count of ffi_sarg slots consumed.  */
+static long
+assign_to_ffi_arg(ffi_sarg *out, void *in, const ffi_type *type,
+                  int write_to_reg)  /* Only consulted for tilegx floats.  */
+{
+  switch (type->type)
+    {
+    case FFI_TYPE_SINT8:
+      *out = *(SINT8 *)in;
+      return 1;
+
+    case FFI_TYPE_UINT8:
+      *out = *(UINT8 *)in;
+      return 1;
+
+    case FFI_TYPE_SINT16:
+      *out = *(SINT16 *)in;
+      return 1;
+
+    case FFI_TYPE_UINT16:
+      *out = *(UINT16 *)in;
+      return 1;
+
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_UINT32:
+#ifndef __LP64__
+    case FFI_TYPE_POINTER:
+#endif
+      /* Note that even unsigned 32-bit quantities are sign extended
+         on tilegx when stored in a register.  */
+      *out = *(SINT32 *)in;
+      return 1;
+
+    case FFI_TYPE_FLOAT:
+#ifdef __tilegx__
+      if (write_to_reg)
+        {
+          /* Properly sign extend the value: reinterpret the float's bits
+             as a signed 32-bit integer and widen that into the slot.  */
+          union { float f; SINT32 s32; } val;
+          val.f = *(float *)in;
+          *out = val.s32;
+        }
+      else
+#endif
+        {
+          *(float *)out = *(float *)in;  /* Plain float store, no widening.  */
+        }
+      return 1;
+
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_DOUBLE:
+#ifdef __LP64__
+    case FFI_TYPE_POINTER:
+#endif
+      *(UINT64 *)out = *(UINT64 *)in;
+      return sizeof(UINT64) / FFI_SIZEOF_ARG;  /* 1 on tilegx, 2 on TILEPro.  */
+
+    case FFI_TYPE_STRUCT:
+      memcpy(out, in, type->size);
+      return (type->size + FFI_SIZEOF_ARG - 1) / FFI_SIZEOF_ARG;  /* Round up.  */
+
+    case FFI_TYPE_VOID:
+      /* Must be a return type. Nothing to do. */
+      return 0;
+
+    default:
+      FFI_ASSERT(0);
+      return -1;  /* Unhandled type code.  */
+    }
+}
+
+/* Call FN as described by CIF with arguments AVALUE; result goes to RVALUE.  */
+void
+ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_sarg * const arg_mem = alloca(cif->bytes);  /* Registers, then stack.  */
+  ffi_sarg * const reg_args = arg_mem;
+  ffi_sarg * const stack_args = &reg_args[NUM_ARG_REGS];
+  ffi_sarg *argp = arg_mem;  /* Next free argument slot.  */
+  ffi_type ** const arg_types = cif->arg_types;
+  const long num_args = cif->nargs;
+  long i;
+
+  if (cif->flags == FFI_TYPE_STRUCT)
+    {
+      /* Pass a hidden pointer to the return value. We make sure there
+         is scratch space for the callee to store the return value even if
+         our caller doesn't care about it. */
+      *argp++ = (intptr_t)(rvalue ? rvalue : alloca(cif->rtype->size));
+
+      /* The callee stores the result itself; skip the copy-back below. */
+      rvalue = NULL;
+    }
+
+  for (i = 0; i < num_args; i++)
+    {
+      ffi_type *type = arg_types[i];
+      void * const arg_in = avalue[i];
+      ptrdiff_t arg_word = argp - arg_mem;  /* Slot index of this argument.  */
+
+#ifndef __tilegx__
+      /* Doubleword-aligned values are always in an even-number register
+         pair, or doubleword-aligned stack slot if out of registers. */
+      long align = arg_word & (type->alignment > FFI_SIZEOF_ARG);  /* 0 or 1 */
+      argp += align;
+      arg_word += align;
+#endif
+
+      if (type->type == FFI_TYPE_STRUCT)
+        {
+          const size_t arg_size_in_words =
+            (type->size + FFI_SIZEOF_ARG - 1) / FFI_SIZEOF_ARG;
+
+          if (arg_word < NUM_ARG_REGS &&
+              arg_word + arg_size_in_words > NUM_ARG_REGS)
+            {
+              /* Args are not allowed to span registers and the stack. */
+              argp = stack_args;
+            }
+
+          memcpy(argp, arg_in, type->size);
+          argp += arg_size_in_words;
+        }
+      else
+        {
+          argp += assign_to_ffi_arg(argp, arg_in, arg_types[i], 1);
+        }
+    }
+
+  /* Actually do the call; bytes past the register block are stack args. */
+  ffi_call_tile(reg_args, stack_args,
+                cif->bytes - (NUM_ARG_REGS * FFI_SIZEOF_ARG), fn);
+
+  if (rvalue != NULL)  /* ffi_call_tile stored r0-r9 back into reg_args.  */
+    assign_to_ffi_arg(rvalue, reg_args, cif->rtype, 0);
+}
+
+
+/* Template code for closure. */
+extern const UINT64 ffi_template_tramp_tile[] FFI_HIDDEN;
+
+/* Write the trampoline into CLOSURE and record CIF, FUN and USER_DATA.  */
+ffi_status
+ffi_prep_closure_loc (ffi_closure *closure,
+                      ffi_cif *cif,
+                      void (*fun)(ffi_cif*, void*, void**, void*),
+                      void *user_data,
+                      void *codeloc)
+{
+#ifdef __tilegx__
+  /* TILE-Gx: load the closure and handler addresses 16 bits at a time. */
+  SINT64 c;
+  SINT64 h;
+  int s;
+  UINT64 *out;
+
+  if (cif->abi != FFI_UNIX)
+    return FFI_BAD_ABI;
+
+  out = (UINT64 *)closure->tramp;
+
+  c = (intptr_t)closure;
+  h = (intptr_t)ffi_closure_tile;
+  s = 0;
+
+  /* Find the smallest shift count that doesn't lose information
+     (i.e. no need to explicitly insert high bits of the address that
+     are just the sign extension of the low bits). */
+  while ((c >> s) != (SINT16)(c >> s) || (h >> s) != (SINT16)(h >> s))
+    s += 16;
+
+#define OPS(a, b, shift) \
+  (create_Imm16_X0((a) >> (shift)) | create_Imm16_X1((b) >> (shift)))
+
+  /* Emit the moveli, then one shl16insli bundle per remaining 16 bits. */
+  *out++ = ffi_template_tramp_tile[0] | OPS(c, h, s);
+  for (s -= 16; s >= 0; s -= 16)
+    *out++ = ffi_template_tramp_tile[1] | OPS(c, h, s);
+
+#undef OPS
+
+  *out++ = ffi_template_tramp_tile[2];  /* Final bundle: jr r10.  */
+
+#else
+  /* TILEPro: one bundle, a jal to ffi_closure_tile relative to CODELOC. */
+  UINT64 *out;
+  intptr_t delta;
+
+  if (cif->abi != FFI_UNIX)
+    return FFI_BAD_ABI;
+
+  out = (UINT64 *)closure->tramp;
+  delta = (intptr_t)ffi_closure_tile - (intptr_t)codeloc;
+
+  *out++ = ffi_template_tramp_tile[0] | create_JOffLong_X1(delta >> 3);
+#endif
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+  /* Invalidate the bytes just written (NOTE(review): tramp, not codeloc?).  */
+  invalidate_icache(closure->tramp, (char *)out - closure->tramp,
+                    getpagesize());
+
+  return FFI_OK;
+}
+
+
+/* This is called by the assembly wrapper for closures. This does
+   all of the work. On entry reg_args[0] holds the values the registers
+   had when the closure was invoked. On return reg_args[1] holds the register
+   values to be returned to the caller (many of which may be garbage). */
+void FFI_HIDDEN
+ffi_closure_tile_inner(ffi_closure *closure,
+                       ffi_sarg reg_args[2][NUM_ARG_REGS],
+                       ffi_sarg *stack_args)
+{
+  ffi_cif * const cif = closure->cif;
+  void ** const avalue = alloca(cif->nargs * sizeof(void *));
+  void *rvalue;
+  ffi_type ** const arg_types = cif->arg_types;
+  ffi_sarg * const reg_args_in = reg_args[0];
+  ffi_sarg * const reg_args_out = reg_args[1];
+  ffi_sarg * argp;
+  long i, arg_word, nargs = cif->nargs;
+  /* Use a union to guarantee proper alignment for double. */
+  union { ffi_sarg arg[NUM_ARG_REGS]; double d; UINT64 u64; } closure_ret;
+
+  /* Start out reading register arguments. */
+  argp = reg_args_in;
+
+  /* Copy the caller's structure return address so that the closure
+     returns the data directly to the caller.  */
+  if (cif->flags == FFI_TYPE_STRUCT)
+    {
+      /* Return by reference via hidden pointer. */
+      rvalue = (void *)(intptr_t)*argp++;
+      arg_word = 1;  /* The hidden pointer used the first register slot.  */
+    }
+  else
+    {
+      /* Return the value in registers. */
+      rvalue = &closure_ret;
+      arg_word = 0;
+    }
+
+  /* Grab the addresses of the arguments. */
+  for (i = 0; i < nargs; i++)
+    {
+      ffi_type * const type = arg_types[i];
+      const size_t arg_size_in_words =
+        (type->size + FFI_SIZEOF_ARG - 1) / FFI_SIZEOF_ARG;
+
+#ifndef __tilegx__
+      /* Doubleword-aligned values are always in an even-number register
+         pair, or doubleword-aligned stack slot if out of registers. */
+      long align = arg_word & (type->alignment > FFI_SIZEOF_ARG);  /* 0 or 1 */
+      argp += align;
+      arg_word += align;
+#endif
+
+      if (arg_word == NUM_ARG_REGS ||
+          (arg_word < NUM_ARG_REGS &&
+           arg_word + arg_size_in_words > NUM_ARG_REGS))
+        {
+          /* Switch to reading arguments from the stack. */
+          argp = stack_args;
+          arg_word = NUM_ARG_REGS;
+        }
+
+      avalue[i] = argp;  /* Points at the spilled slot itself; no copy.  */
+      argp += arg_size_in_words;
+      arg_word += arg_size_in_words;
+    }
+
+  /* Invoke the closure.  */
+  closure->fun(cif, rvalue, avalue, closure->user_data);
+
+  if (cif->flags != FFI_TYPE_STRUCT)
+    {
+      /* Canonicalize for register representation. */
+      assign_to_ffi_arg(reg_args_out, &closure_ret, cif->rtype, 1);
+    }
+}
diff --git a/libffi/src/tile/ffitarget.h b/libffi/src/tile/ffitarget.h
new file mode 100644
index 0000000..679fb5d
--- /dev/null
+++ b/libffi/src/tile/ffitarget.h
@@ -0,0 +1,65 @@
+/* -----------------------------------------------------------------*-C-*-
+   ffitarget.h - Copyright (c) 2012 Tilera Corp.
+   Target configuration macros for TILE.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#ifndef LIBFFI_TARGET_H
+#define LIBFFI_TARGET_H
+
+#ifndef LIBFFI_H
+#error "Please do not include ffitarget.h directly into your source.  Use ffi.h instead."
+#endif
+
+#ifndef LIBFFI_ASM
+/* C-only declarations; skipped when an assembly file defines LIBFFI_ASM. */
+#include <arch/abi.h>
+
+typedef uint_reg_t ffi_arg;
+typedef int_reg_t  ffi_sarg;
+
+typedef enum ffi_abi {
+  FFI_FIRST_ABI = 0,
+  FFI_UNIX,  /* The only ABI that ffi_prep_closure_loc accepts.  */
+  FFI_LAST_ABI,
+  FFI_DEFAULT_ABI = FFI_UNIX
+} ffi_abi;
+#endif
+
+/* ---- Definitions for closures ----------------------------------------- */
+#define FFI_CLOSURES 1
+
+#ifdef __tilegx__
+/* We always pass 8-byte values, even in -m32 mode. */
+# define FFI_SIZEOF_ARG 8
+# ifdef __LP64__
+#  define FFI_TRAMPOLINE_SIZE (8 * 5)  /* 5 bundles: moveli, <=3 shl16insli, jr */
+# else
+#  define FFI_TRAMPOLINE_SIZE (8 * 3)  /* 3 bundles: moveli, <=1 shl16insli, jr */
+# endif
+#else
+# define FFI_SIZEOF_ARG 4
+# define FFI_TRAMPOLINE_SIZE 8 /* 1 bundle: the patched jal */
+#endif
+#define FFI_NATIVE_RAW_API 0
+
+#endif
diff --git a/libffi/src/tile/tile.S b/libffi/src/tile/tile.S
new file mode 100644
index 0000000..a186e1f
--- /dev/null
+++ b/libffi/src/tile/tile.S
@@ -0,0 +1,360 @@
+/* -----------------------------------------------------------------------
+   tile.S - Copyright (c) 2011 Tilera Corp.
+
+   Tilera TILEPro and TILE-Gx Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+
+/* Number of bytes in a register. */
+#define REG_SIZE FFI_SIZEOF_ARG
+
+/* Number of bytes in stack linkage area for backtracing.
+
+   A note about the ABI: on entry to a procedure, sp points to a stack
+   slot where it must spill the return address if it's not a leaf.
+   REG_SIZE bytes beyond that is a slot owned by the caller which
+   contains the sp value that the caller had when it was originally
+   entered (i.e. the caller's frame pointer). */
+#define LINKAGE_SIZE (2 * REG_SIZE)
+
+/* The first 10 registers are used to pass arguments and return values. */
+#define NUM_ARG_REGS 10
+
+#ifdef __tilegx__
+#define SW st
+#define LW ld
+#define BGZT bgtzt
+#else
+#define SW sw
+#define LW lw
+#define BGZT bgzt
+#endif
+
+
+/* void ffi_call_tile (int_reg_t reg_args[NUM_ARG_REGS],
+                       const int_reg_t *stack_args,
+                       unsigned long stack_args_bytes,
+                       void (*fnaddr)(void));
+
+        On entry, REG_ARGS contain the outgoing register values,
+        and STACK_ARGS contains STACK_ARG_BYTES of additional values
+        to be passed on the stack. If STACK_ARG_BYTES is zero, then
+        STACK_ARGS is ignored.
+
+        When the invoked function returns, the values of r0-r9 are
+        blindly stored back into REG_ARGS for the caller to examine. */
+
+        .section .text.ffi_call_tile, "ax", @progbits
+        .align  8
+        .globl  ffi_call_tile
+        FFI_HIDDEN(ffi_call_tile)
+ffi_call_tile:
+
+/* Incoming arguments. */
+#define REG_ARGS                r0
+#define INCOMING_STACK_ARGS     r1
+#define STACK_ARG_BYTES         r2
+#define ORIG_FNADDR             r3
+
+/* Temporary values. */
+#define FRAME_SIZE              r10
+#define TMP                     r11
+#define TMP2                    r12
+#define OUTGOING_STACK_ARGS     r13
+#define REG_ADDR_PTR            r14
+#define RETURN_REG_ADDR         r15
+#define FNADDR                  r16
+
+        .cfi_startproc
+        {
+         /* Save return address at the incoming sp. */
+         SW     sp, lr
+         .cfi_offset lr, 0
+         /* Prepare to spill incoming r52. */
+         addi   TMP, sp, -REG_SIZE
+         /* Increase frame size to have room to spill r52 and REG_ARGS.
+            The +7 is to round up mod 8. */
+         addi   FRAME_SIZE, STACK_ARG_BYTES, \
+                REG_SIZE + REG_SIZE + LINKAGE_SIZE + 7
+        }
+        {
+         /* Round stack frame size to a multiple of 8 to satisfy ABI. */
+         andi   FRAME_SIZE, FRAME_SIZE, -8
+         /* Compute where to spill REG_ARGS value. */
+         addi   TMP2, sp, -(REG_SIZE * 2)
+        }
+        {
+         /* Spill incoming r52. */
+         SW     TMP, r52
+         .cfi_offset r52, -REG_SIZE
+         /* Set up our frame pointer. */
+         move   r52, sp
+         .cfi_def_cfa_register r52
+         /* Push stack frame. */
+         sub    sp, sp, FRAME_SIZE
+        }
+        {
+         /* Prepare to set up stack linkage. */
+         addi   TMP, sp, REG_SIZE
+         /* Prepare to memcpy stack args. */
+         addi   OUTGOING_STACK_ARGS, sp, LINKAGE_SIZE
+         /* Save REG_ARGS which we will need after we call the subroutine. */
+         SW     TMP2, REG_ARGS
+        }
+        {
+         /* Set up linkage info to hold incoming stack pointer. */
+         SW     TMP, r52
+        }
+        {
+         /* Skip stack args memcpy if we don't have any stack args (common). */
+         blezt  STACK_ARG_BYTES, .Ldone_stack_args_memcpy
+        }
+
+.Lmemcpy_stack_args:
+        {
+         /* Load incoming argument from stack_args. */
+         LW     TMP, INCOMING_STACK_ARGS
+         addi   INCOMING_STACK_ARGS, INCOMING_STACK_ARGS, REG_SIZE
+        }
+        {
+         /* Store stack argument into outgoing stack argument area. */
+         SW     OUTGOING_STACK_ARGS, TMP
+         addi   OUTGOING_STACK_ARGS, OUTGOING_STACK_ARGS, REG_SIZE
+         addi   STACK_ARG_BYTES, STACK_ARG_BYTES, -REG_SIZE
+        }
+        {
+         BGZT   STACK_ARG_BYTES, .Lmemcpy_stack_args  /* Loop while bytes remain. */
+        }
+.Ldone_stack_args_memcpy:
+
+        {
+         /* Copy aside ORIG_FNADDR so we can overwrite its register. */
+         move   FNADDR, ORIG_FNADDR
+         /* Prepare to load argument registers. */
+         addi   REG_ADDR_PTR, r0, REG_SIZE
+         /* Load outgoing r0. */
+         LW     r0, r0
+        }
+
+        /* Load up argument registers from the REG_ARGS array. */
+#define LOAD_REG(REG, PTR) \
+        { \
+         LW     REG, PTR ; \
+         addi   PTR, PTR, REG_SIZE \
+        }
+
+        LOAD_REG(r1, REG_ADDR_PTR)
+        LOAD_REG(r2, REG_ADDR_PTR)
+        LOAD_REG(r3, REG_ADDR_PTR)
+        LOAD_REG(r4, REG_ADDR_PTR)
+        LOAD_REG(r5, REG_ADDR_PTR)
+        LOAD_REG(r6, REG_ADDR_PTR)
+        LOAD_REG(r7, REG_ADDR_PTR)
+        LOAD_REG(r8, REG_ADDR_PTR)
+        LOAD_REG(r9, REG_ADDR_PTR)
+
+        {
+         /* Call the subroutine. */
+         jalr   FNADDR
+        }
+
+        {
+         /* Restore original lr (r52 is the incoming sp, where it was saved). */
+         LW     lr, r52
+         /* Prepare to recover REG_ARGS, which we spilled earlier. */
+         addi   TMP, r52, -(2 * REG_SIZE)
+        }
+        {
+         /* Reload REG_ARGS, so we can fill it in with the return regs r0-r9. */
+         LW     RETURN_REG_ADDR, TMP
+         /* Prepare to restore original r52. */
+         addi   TMP, r52, -REG_SIZE
+        }
+
+        {
+         /* Pop stack frame. */
+         move   sp, r52
+         /* Restore original r52. */
+         LW     r52, TMP
+        }
+
+#define STORE_REG(REG, PTR) \
+        { \
+         SW     PTR, REG ; \
+         addi   PTR, PTR, REG_SIZE \
+        }
+
+        /* Return all register values by reference. */
+        STORE_REG(r0, RETURN_REG_ADDR)
+        STORE_REG(r1, RETURN_REG_ADDR)
+        STORE_REG(r2, RETURN_REG_ADDR)
+        STORE_REG(r3, RETURN_REG_ADDR)
+        STORE_REG(r4, RETURN_REG_ADDR)
+        STORE_REG(r5, RETURN_REG_ADDR)
+        STORE_REG(r6, RETURN_REG_ADDR)
+        STORE_REG(r7, RETURN_REG_ADDR)
+        STORE_REG(r8, RETURN_REG_ADDR)
+        STORE_REG(r9, RETURN_REG_ADDR)
+
+        {
+         jrp    lr
+        }
+
+        .cfi_endproc
+        .size ffi_call_tile, .-ffi_call_tile
+
+/* ffi_closure_tile(...)
+
+   On entry on TILEPro, lr points to the closure plus 8 bytes and r10 holds
+   the real return address; on TILE-Gx, lr is the return address, r11 the closure.
+
+   This function simply dumps all register parameters into a stack array
+   and passes the closure, the registers array, and the stack arguments
+   to C code that does all of the actual closure processing. */
+
+        .section .text.ffi_closure_tile, "ax", @progbits
+        .align  8
+        .globl  ffi_closure_tile
+        FFI_HIDDEN(ffi_closure_tile)
+
+        .cfi_startproc
+/* Room for two NUM_ARG_REGS arrays (incoming and returned), plus linkage. */
+#define CLOSURE_FRAME_SIZE (((NUM_ARG_REGS * REG_SIZE * 2 + LINKAGE_SIZE) + 7) & -8)
+ffi_closure_tile:
+        {
+#ifdef __tilegx__
+         st     sp, lr
+         .cfi_offset lr, 0
+#else
+         /* Save return address (in r10 due to closure stub wrapper). */
+         SW     sp, r10
+         .cfi_return_column r10
+         .cfi_offset r10, 0
+#endif
+         /* Compute address for stack frame linkage. */
+         addli   r10, sp, -(CLOSURE_FRAME_SIZE - REG_SIZE)
+        }
+        {
+         /* Save incoming stack pointer in linkage area. */
+         SW     r10, sp
+         .cfi_offset sp, -(CLOSURE_FRAME_SIZE - REG_SIZE)
+         /* Push a new stack frame. */
+         addli   sp, sp, -CLOSURE_FRAME_SIZE
+         .cfi_adjust_cfa_offset CLOSURE_FRAME_SIZE
+        }
+
+        {
+         /* Create pointer to where to start spilling registers. */
+         addi   r10, sp, LINKAGE_SIZE
+        }
+
+        /* Spill all the incoming registers. */
+        STORE_REG(r0, r10)
+        STORE_REG(r1, r10)
+        STORE_REG(r2, r10)
+        STORE_REG(r3, r10)
+        STORE_REG(r4, r10)
+        STORE_REG(r5, r10)
+        STORE_REG(r6, r10)
+        STORE_REG(r7, r10)
+        STORE_REG(r8, r10)
+        {
+         /* Save r9. */
+         SW     r10, r9
+#ifdef __tilegx__
+         /* Pointer to closure is passed in r11. */
+         move  r0, r11
+#else
+         /* Compute pointer to the closure object. Because the closure
+            starts with a "jal ffi_closure_tile", we can just take the
+            value of lr (a phony return address pointing into the closure)
+            and subtract 8. */
+         addi   r0, lr, -8
+#endif
+         /* Compute a pointer to the register arguments we just spilled. */
+         addi   r1, sp, LINKAGE_SIZE
+        }
+        {
+         /* Compute a pointer to the extra stack arguments (if any). */
+         addli   r2, sp, CLOSURE_FRAME_SIZE + LINKAGE_SIZE
+         /* Call C code to deal with all of the grotty details. */
+         jal    ffi_closure_tile_inner
+        }
+        {
+         addli   r10, sp, CLOSURE_FRAME_SIZE  /* Where the return address was saved. */
+        }
+        {
+         /* Restore the return address. */
+         LW     lr, r10
+         /* Compute pointer to the second (outgoing) registers array. */
+         addli   r10, sp, LINKAGE_SIZE + (NUM_ARG_REGS * REG_SIZE)
+        }
+        /* Return all the register values, which C code may have set. */
+        LOAD_REG(r0, r10)
+        LOAD_REG(r1, r10)
+        LOAD_REG(r2, r10)
+        LOAD_REG(r3, r10)
+        LOAD_REG(r4, r10)
+        LOAD_REG(r5, r10)
+        LOAD_REG(r6, r10)
+        LOAD_REG(r7, r10)
+        LOAD_REG(r8, r10)
+        LOAD_REG(r9, r10)
+        {
+         /* Pop the frame. */
+         addli   sp, sp, CLOSURE_FRAME_SIZE
+         jrp    lr
+        }
+
+        .cfi_endproc
+        .size   ffi_closure_tile, . - ffi_closure_tile
+
+
+/* What follows are code template instructions that get copied to the
+   closure trampoline by ffi_prep_closure_loc.  The zeroed operands
+   get replaced by their proper values at runtime. */
+
+        .section .text.ffi_template_tramp_tile, "ax", @progbits
+        .align  8
+        .globl  ffi_template_tramp_tile
+        FFI_HIDDEN(ffi_template_tramp_tile)
+ffi_template_tramp_tile:
+#ifdef __tilegx__
+        {
+          moveli r11, 0 /* backpatched to address of containing closure. */
+          moveli r10, 0 /* backpatched to ffi_closure_tile. */
+        }
+        /* Note: the following bundle is emitted zero or more times,
+           depending on the pointer value (esp. useful for -m32 mode). */
+        { shl16insli r11, r11, 0 ; shl16insli r10, r10, 0 }
+        { info 2+8 /* for backtracer: -> pc in lr, frame size 0 */ ; jr r10 }
+#else
+        /* 'jal .' yields a PC-relative offset of zero so we can OR in the
+           right offset at runtime; lr is copied to r10 before jal overwrites it. */
+        { move r10, lr ; jal . /* ffi_closure_tile */ }
+#endif
+
+        .size   ffi_template_tramp_tile, . - ffi_template_tramp_tile
diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index f643b34..611e221 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -58,7 +58,8 @@ void ffi_prep_args(char *stack, extended_cif *ecif)
 
   argp = stack;
 
-  if (ecif->cif->flags == FFI_TYPE_STRUCT
+  if ((ecif->cif->flags == FFI_TYPE_STRUCT
+       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
 #ifdef X86_WIN64
       && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
           && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
@@ -279,7 +280,12 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
       else
 #endif
         {
-          cif->flags = FFI_TYPE_STRUCT;
+#ifdef X86_WIN32
+          if (cif->abi == FFI_MS_CDECL)
+            cif->flags = FFI_TYPE_MS_STRUCT;
+          else
+#endif
+            cif->flags = FFI_TYPE_STRUCT;
           /* allocate space for return value pointer */
           cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
         }
@@ -349,7 +355,8 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
     }
 #else
   if (rvalue == NULL
-      && cif->flags == FFI_TYPE_STRUCT)
+      && (cif->flags == FFI_TYPE_STRUCT
+          || cif->flags == FFI_TYPE_MS_STRUCT))
     {
       ecif.rvalue = alloca(cif->rtype->size);
     }
@@ -368,6 +375,7 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 #elif defined(X86_WIN32)
     case FFI_SYSV:
     case FFI_STDCALL:
+    case FFI_MS_CDECL:
       ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
 		     ecif.rvalue, fn);
       break;
@@ -513,7 +521,8 @@ ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
     argp += sizeof(void *);
   }
 #else
-  if ( cif->flags == FFI_TYPE_STRUCT ) {
+  if ( cif->flags == FFI_TYPE_STRUCT
+       || cif->flags == FFI_TYPE_MS_STRUCT ) {
     *rvalue = *(void **) argp;
     argp += sizeof(void *);
   }
@@ -673,6 +682,12 @@ ffi_prep_closure_loc (ffi_closure* closure,
                                    &ffi_closure_STDCALL,
                                    (void*)codeloc, cif->bytes);
     }
+  else if (cif->abi == FFI_MS_CDECL)
+    {
+      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
+                           &ffi_closure_SYSV,
+                           (void*)codeloc);
+    }
 #endif /* X86_WIN32 */
 #endif /* !X86_WIN64 */
   else
@@ -762,8 +777,9 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
   /* If the return value is a struct and we don't have a return */
   /* value address then we need to make one                     */
 
-  if ((rvalue == NULL) && 
-      (cif->rtype->type == FFI_TYPE_STRUCT))
+  if (rvalue == NULL
+      && (cif->flags == FFI_TYPE_STRUCT
+          || cif->flags == FFI_TYPE_MS_STRUCT))
     {
       ecif.rvalue = alloca(cif->rtype->size);
     }
@@ -776,6 +792,7 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
 #ifdef X86_WIN32
     case FFI_SYSV:
     case FFI_STDCALL:
+    case FFI_MS_CDECL:
       ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
 		     ecif.rvalue, fn);
       break;
diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index defd774..1daa1c0 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -37,11 +37,17 @@
 #define MAX_GPR_REGS 6
 #define MAX_SSE_REGS 8
 
+#ifdef __INTEL_COMPILER
+#define UINT128 __m128
+#else
+#define UINT128 __int128_t
+#endif
+
 struct register_args
 {
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
-  __int128_t sse[MAX_SSE_REGS];
+  UINT128 sse[MAX_SSE_REGS];
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index fc01541..46f294c 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -81,9 +81,13 @@ typedef enum ffi_abi {
   FFI_STDCALL,
   FFI_THISCALL,
   FFI_FASTCALL,
+  FFI_MS_CDECL,
   FFI_LAST_ABI,
-  /* TODO: Add fastcall support for the sake of completeness */
+#ifdef _MSC_VER
+  FFI_DEFAULT_ABI = FFI_MS_CDECL
+#else
   FFI_DEFAULT_ABI = FFI_SYSV
+#endif
 
 #elif defined(X86_WIN64)
   FFI_WIN64,
@@ -110,6 +114,7 @@ typedef enum ffi_abi {
 #define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
 #define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
+#define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
 
 #if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
 #define FFI_TRAMPOLINE_SIZE 24
diff --git a/libffi/src/x86/win32.S b/libffi/src/x86/win32.S
index deb4a03..24b7bbd 100644
--- a/libffi/src/x86/win32.S
+++ b/libffi/src/x86/win32.S
@@ -108,31 +108,37 @@ ca_jumpdata:
         dd offset ca_retfloat       ;; FFI_TYPE_FLOAT
         dd offset ca_retdouble      ;; FFI_TYPE_DOUBLE
         dd offset ca_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset ca_retint8        ;; FFI_TYPE_UINT8
-        dd offset ca_retint8        ;; FFI_TYPE_SINT8
-        dd offset ca_retint16       ;; FFI_TYPE_UINT16
-        dd offset ca_retint16       ;; FFI_TYPE_SINT16
+        dd offset ca_retuint8       ;; FFI_TYPE_UINT8
+        dd offset ca_retsint8       ;; FFI_TYPE_SINT8
+        dd offset ca_retuint16      ;; FFI_TYPE_UINT16
+        dd offset ca_retsint16      ;; FFI_TYPE_SINT16
         dd offset ca_retint         ;; FFI_TYPE_UINT32
         dd offset ca_retint         ;; FFI_TYPE_SINT32
         dd offset ca_retint64       ;; FFI_TYPE_UINT64
         dd offset ca_retint64       ;; FFI_TYPE_SINT64
         dd offset ca_epilogue       ;; FFI_TYPE_STRUCT
         dd offset ca_retint         ;; FFI_TYPE_POINTER
-        dd offset ca_retint8        ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset ca_retint16       ;; FFI_TYPE_SMALL_STRUCT_2B
+        dd offset ca_retstruct1b    ;; FFI_TYPE_SMALL_STRUCT_1B
+        dd offset ca_retstruct2b    ;; FFI_TYPE_SMALL_STRUCT_2B
         dd offset ca_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
+        dd offset ca_epilogue       ;; FFI_TYPE_MS_STRUCT
 
-ca_retint8:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], al
-        jmp   ca_epilogue
+        /* Sign/zero extend as appropriate.  */
+ca_retuint8:
+        movzx eax, al
+        jmp   ca_retint
 
-ca_retint16:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], ax
-        jmp   ca_epilogue
+ca_retsint8:
+        movsx eax, al
+        jmp   ca_retint
+
+ca_retuint16:
+        movzx eax, ax
+        jmp   ca_retint
+
+ca_retsint16:
+        movsx eax, ax
+        jmp   ca_retint
 
 ca_retint:
         ;; Load %ecx with the pointer to storage for the return value
@@ -165,14 +171,24 @@ ca_retlongdouble:
         fstp  TBYTE PTR [ecx]
         jmp   ca_epilogue
 
+ca_retstruct1b:
+        ;; Load %ecx with the pointer to storage for the return value
+        mov   ecx, rvalue
+        mov   [ecx + 0], al
+        jmp   ca_epilogue
+
+ca_retstruct2b:
+        ;; Load %ecx with the pointer to storage for the return value
+        mov   ecx, rvalue
+        mov   [ecx + 0], ax
+        jmp   ca_epilogue
+
 ca_epilogue:
         ;; Epilogue code is autogenerated.
         ret
 ffi_call_win32 ENDP
 
 ffi_closure_THISCALL PROC NEAR FORCEFRAME
-	push	ebp
-	mov	ebp, esp
 	sub	esp, 40
 	lea	edx, [ebp -24]
 	mov	[ebp - 12], edx	/* resp */
@@ -187,7 +203,7 @@ ffi_closure_SYSV PROC NEAR FORCEFRAME
         lea  edx, [ebp - 24]
         mov  [ebp - 12], edx         ;; resp
         lea  edx, [ebp + 8]
-stub:
+stub::
         mov  [esp + 8], edx          ;; args
         lea  edx, [ebp - 12]
         mov  [esp + 4], edx          ;; &resp
@@ -204,26 +220,35 @@ cs_jumpdata:
         dd offset cs_retfloat       ;; FFI_TYPE_FLOAT
         dd offset cs_retdouble      ;; FFI_TYPE_DOUBLE
         dd offset cs_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cs_retint8        ;; FFI_TYPE_UINT8
-        dd offset cs_retint8        ;; FFI_TYPE_SINT8
-        dd offset cs_retint16       ;; FFI_TYPE_UINT16
-        dd offset cs_retint16       ;; FFI_TYPE_SINT16
+        dd offset cs_retuint8       ;; FFI_TYPE_UINT8
+        dd offset cs_retsint8       ;; FFI_TYPE_SINT8
+        dd offset cs_retuint16      ;; FFI_TYPE_UINT16
+        dd offset cs_retsint16      ;; FFI_TYPE_SINT16
         dd offset cs_retint         ;; FFI_TYPE_UINT32
         dd offset cs_retint         ;; FFI_TYPE_SINT32
         dd offset cs_retint64       ;; FFI_TYPE_UINT64
         dd offset cs_retint64       ;; FFI_TYPE_SINT64
         dd offset cs_retstruct      ;; FFI_TYPE_STRUCT
         dd offset cs_retint         ;; FFI_TYPE_POINTER
-        dd offset cs_retint8        ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cs_retint16       ;; FFI_TYPE_SMALL_STRUCT_2B
+        dd offset cs_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
+        dd offset cs_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
         dd offset cs_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
+        dd offset cs_retmsstruct    ;; FFI_TYPE_MS_STRUCT
+
+cs_retuint8:
+        movzx eax, BYTE PTR [ecx]
+        jmp   cs_epilogue
+
+cs_retsint8:
+        movsx eax, BYTE PTR [ecx]
+        jmp   cs_epilogue
 
-cs_retint8:
-        mov   al, [ecx]
+cs_retuint16:
+        movzx eax, WORD PTR [ecx]
         jmp   cs_epilogue
 
-cs_retint16:
-        mov   ax, [ecx]
+cs_retsint16:
+        movsx eax, WORD PTR [ecx]
         jmp   cs_epilogue
 
 cs_retint:
@@ -252,6 +277,12 @@ cs_retstruct:
         ;; Epilogue code is autogenerated.
         ret	4
 
+cs_retmsstruct:
+        ;; Caller expects us to return a pointer to the real return value.
+        mov   eax, ecx
+        ;; Caller doesn't expect us to pop the hidden struct return value pointer arg.
+        jmp   cs_epilogue
+
 cs_epilogue:
         ;; Epilogue code is autogenerated.
         ret
@@ -264,19 +295,16 @@ ffi_closure_SYSV ENDP
 #define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
 #define CIF_FLAGS_OFFSET 20
 
-ffi_closure_raw_THISCALL PROC NEAR
-	push ebp
-	mov  ebp, esp
-	push esi
+ffi_closure_raw_THISCALL PROC NEAR USES esi FORCEFRAME
 	sub esp, 36
 	mov  esi, [eax + RAW_CLOSURE_CIF_OFFSET]        ;; closure->cif
 	mov  edx, [eax + RAW_CLOSURE_USER_DATA_OFFSET]  ;; closure->user_data
 	mov [esp + 12], edx
-	lea edx, [ebp + 12], edx
+	lea edx, [ebp + 12]
 	jmp stubraw
-ffi_closure_raw_SYSV ENDP
+ffi_closure_raw_THISCALL ENDP
 
-ffi_closure_raw_SYSV PROC NEAR USES esi
+ffi_closure_raw_SYSV PROC NEAR USES esi FORCEFRAME
     ;; the ffi_closure ctx is passed in eax by the trampoline.
 
         sub  esp, 40
@@ -284,7 +312,7 @@ ffi_closure_raw_SYSV PROC NEAR USES esi
         mov  edx, [eax + RAW_CLOSURE_USER_DATA_OFFSET]  ;; closure->user_data
         mov  [esp + 12], edx                            ;; user_data
         lea  edx, [ebp + 8]
-stubraw:
+stubraw::
         mov  [esp + 8], edx                             ;; raw_args
         lea  edx, [ebp - 24]
         mov  [esp + 4], edx                             ;; &res
@@ -302,26 +330,35 @@ cr_jumpdata:
         dd offset cr_retfloat       ;; FFI_TYPE_FLOAT
         dd offset cr_retdouble      ;; FFI_TYPE_DOUBLE
         dd offset cr_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cr_retint8        ;; FFI_TYPE_UINT8
-        dd offset cr_retint8        ;; FFI_TYPE_SINT8
-        dd offset cr_retint16       ;; FFI_TYPE_UINT16
-        dd offset cr_retint16       ;; FFI_TYPE_SINT16
+        dd offset cr_retuint8       ;; FFI_TYPE_UINT8
+        dd offset cr_retsint8       ;; FFI_TYPE_SINT8
+        dd offset cr_retuint16      ;; FFI_TYPE_UINT16
+        dd offset cr_retsint16      ;; FFI_TYPE_SINT16
         dd offset cr_retint         ;; FFI_TYPE_UINT32
         dd offset cr_retint         ;; FFI_TYPE_SINT32
         dd offset cr_retint64       ;; FFI_TYPE_UINT64
         dd offset cr_retint64       ;; FFI_TYPE_SINT64
         dd offset cr_epilogue       ;; FFI_TYPE_STRUCT
         dd offset cr_retint         ;; FFI_TYPE_POINTER
-        dd offset cr_retint8        ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cr_retint16       ;; FFI_TYPE_SMALL_STRUCT_2B
+        dd offset cr_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
+        dd offset cr_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
         dd offset cr_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
+        dd offset cr_epilogue       ;; FFI_TYPE_MS_STRUCT
+
+cr_retuint8:
+        movzx eax, BYTE PTR [ecx]
+        jmp   cr_epilogue
+
+cr_retsint8:
+        movsx eax, BYTE PTR [ecx]
+        jmp   cr_epilogue
 
-cr_retint8:
-        mov   al, [ecx]
+cr_retuint16:
+        movzx eax, WORD PTR [ecx]
         jmp   cr_epilogue
 
-cr_retint16:
-        mov   ax, [ecx]
+cr_retsint16:
+        movsx eax, WORD PTR [ecx]
         jmp   cr_epilogue
 
 cr_retint:
@@ -375,26 +412,34 @@ cd_jumpdata:
         dd offset cd_retfloat       ;; FFI_TYPE_FLOAT
         dd offset cd_retdouble      ;; FFI_TYPE_DOUBLE
         dd offset cd_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cd_retint8        ;; FFI_TYPE_UINT8
-        dd offset cd_retint8        ;; FFI_TYPE_SINT8
-        dd offset cd_retint16       ;; FFI_TYPE_UINT16
-        dd offset cd_retint16       ;; FFI_TYPE_SINT16
+        dd offset cd_retuint8       ;; FFI_TYPE_UINT8
+        dd offset cd_retsint8       ;; FFI_TYPE_SINT8
+        dd offset cd_retuint16      ;; FFI_TYPE_UINT16
+        dd offset cd_retsint16      ;; FFI_TYPE_SINT16
         dd offset cd_retint         ;; FFI_TYPE_UINT32
         dd offset cd_retint         ;; FFI_TYPE_SINT32
         dd offset cd_retint64       ;; FFI_TYPE_UINT64
         dd offset cd_retint64       ;; FFI_TYPE_SINT64
         dd offset cd_epilogue       ;; FFI_TYPE_STRUCT
         dd offset cd_retint         ;; FFI_TYPE_POINTER
-        dd offset cd_retint8        ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cd_retint16       ;; FFI_TYPE_SMALL_STRUCT_2B
+        dd offset cd_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
+        dd offset cd_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
         dd offset cd_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
 
-cd_retint8:
-        mov   al, [ecx]
+cd_retuint8:
+        movzx eax, BYTE PTR [ecx]
+        jmp   cd_epilogue
+
+cd_retsint8:
+        movsx eax, BYTE PTR [ecx]
         jmp   cd_epilogue
 
-cd_retint16:
-        mov   ax, [ecx]
+cd_retuint16:
+        movzx eax, WORD PTR [ecx]
+        jmp   cd_epilogue
+
+cd_retsint16:
+        movsx eax, WORD PTR [ecx]
         jmp   cd_epilogue
 
 cd_retint:
@@ -515,6 +560,7 @@ _ffi_call_win32:
 	.long	.Lretstruct1b		/* FFI_TYPE_SMALL_STRUCT_1B */
 	.long	.Lretstruct2b		/* FFI_TYPE_SMALL_STRUCT_2B */
 	.long	.Lretstruct4b		/* FFI_TYPE_SMALL_STRUCT_4B */
+	.long	.Lretstruct		/* FFI_TYPE_MS_STRUCT */
 1:
 	add	%ecx, %ecx
 	add	%ecx, %ecx
@@ -657,6 +703,7 @@ _ffi_closure_SYSV:
 	.long	.Lcls_retstruct1	/* FFI_TYPE_SMALL_STRUCT_1B */
 	.long	.Lcls_retstruct2	/* FFI_TYPE_SMALL_STRUCT_2B */
 	.long	.Lcls_retstruct4	/* FFI_TYPE_SMALL_STRUCT_4B */
+	.long	.Lcls_retmsstruct	/* FFI_TYPE_MS_STRUCT */
 
 1:
 	add	%eax, %eax
@@ -721,6 +768,12 @@ _ffi_closure_SYSV:
 	popl	%ebp
 	ret	$0x4
 
+.Lcls_retmsstruct:
+	# Caller expects us to return a pointer to the real return value.
+	mov	%ecx, %eax
+	# Caller doesn't expect us to pop the hidden struct return value pointer arg.
+	jmp	.Lcls_epilogue
+
 .Lcls_noretval:
 .Lcls_epilogue:
 	movl	%ebp, %esp
@@ -798,6 +851,7 @@ _ffi_closure_raw_SYSV:
 	.long	.Lrcls_retstruct1	/* FFI_TYPE_SMALL_STRUCT_1B */
 	.long	.Lrcls_retstruct2	/* FFI_TYPE_SMALL_STRUCT_2B */
 	.long	.Lrcls_retstruct4	/* FFI_TYPE_SMALL_STRUCT_4B */
+	.long	.Lrcls_retstruct	/* FFI_TYPE_MS_STRUCT */
 1:
 	add	%eax, %eax
 	add	%eax, %eax