aboutsummaryrefslogtreecommitdiff
path: root/libffi/src
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2021-08-31 07:14:47 -0700
committerH.J. Lu <hjl.tools@gmail.com>2021-10-20 05:35:52 -0700
commit92456a4e5658e138e2cea79e390e3306b07685b0 (patch)
tree6ef878e933b504a902035f1ae89510fde96a976d /libffi/src
parentd738405e7fe62cc8eb9580948a6ea39005cd7170 (diff)
downloadgcc-92456a4e5658e138e2cea79e390e3306b07685b0.zip
gcc-92456a4e5658e138e2cea79e390e3306b07685b0.tar.gz
gcc-92456a4e5658e138e2cea79e390e3306b07685b0.tar.bz2
libffi: Sync with libffi 3.4.2
Merged commit: f9ea41683444ebe11cfa45b05223899764df28fb
Diffstat (limited to 'libffi/src')
-rw-r--r--libffi/src/aarch64/ffi.c536
-rw-r--r--libffi/src/aarch64/ffitarget.h35
-rw-r--r--libffi/src/aarch64/internal.h33
-rw-r--r--libffi/src/aarch64/sysv.S189
-rw-r--r--libffi/src/aarch64/win64_armasm.S506
-rw-r--r--libffi/src/alpha/ffi.c6
-rw-r--r--libffi/src/arc/ffi.c6
-rw-r--r--libffi/src/arm/ffi.c380
-rw-r--r--libffi/src/arm/ffitarget.h24
-rw-r--r--libffi/src/arm/internal.h10
-rw-r--r--libffi/src/arm/sysv.S304
-rw-r--r--libffi/src/arm/sysv_msvc_arm32.S311
-rw-r--r--libffi/src/closures.c489
-rw-r--r--libffi/src/cris/ffi.c4
-rw-r--r--libffi/src/csky/ffi.c395
-rw-r--r--libffi/src/csky/ffitarget.h63
-rw-r--r--libffi/src/csky/sysv.S371
-rw-r--r--libffi/src/dlmalloc.c7
-rw-r--r--libffi/src/frv/ffi.c4
-rw-r--r--libffi/src/ia64/ffi.c30
-rw-r--r--libffi/src/ia64/ffitarget.h3
-rw-r--r--libffi/src/ia64/unix.S9
-rw-r--r--libffi/src/java_raw_api.c6
-rw-r--r--libffi/src/kvx/asm.h5
-rw-r--r--libffi/src/kvx/ffi.c273
-rw-r--r--libffi/src/kvx/ffitarget.h75
-rw-r--r--libffi/src/kvx/sysv.S127
-rw-r--r--libffi/src/m32r/ffi.c2
-rw-r--r--libffi/src/m68k/ffi.c4
-rw-r--r--libffi/src/m68k/sysv.S29
-rw-r--r--libffi/src/m88k/ffi.c8
-rw-r--r--libffi/src/metag/ffi.c14
-rw-r--r--libffi/src/microblaze/ffi.c10
-rw-r--r--libffi/src/mips/ffi.c146
-rw-r--r--libffi/src/mips/ffitarget.h23
-rw-r--r--libffi/src/mips/n32.S151
-rw-r--r--libffi/src/mips/o32.S177
-rw-r--r--libffi/src/moxie/eabi.S2
-rw-r--r--libffi/src/moxie/ffi.c27
-rw-r--r--libffi/src/nios2/ffi.c4
-rw-r--r--libffi/src/pa/ffi.c216
-rw-r--r--libffi/src/pa/ffitarget.h11
-rw-r--r--libffi/src/pa/hpux32.S76
-rw-r--r--libffi/src/pa/linux.S160
-rw-r--r--libffi/src/powerpc/asm.h4
-rw-r--r--libffi/src/powerpc/darwin_closure.S6
-rw-r--r--libffi/src/powerpc/ffi.c10
-rw-r--r--libffi/src/powerpc/ffi_darwin.c48
-rw-r--r--libffi/src/powerpc/ffi_linux64.c247
-rw-r--r--libffi/src/powerpc/ffi_powerpc.h25
-rw-r--r--libffi/src/powerpc/ffitarget.h14
-rw-r--r--libffi/src/powerpc/linux64.S111
-rw-r--r--libffi/src/powerpc/linux64_closure.S70
-rw-r--r--libffi/src/powerpc/sysv.S12
-rw-r--r--libffi/src/prep_cif.c64
-rw-r--r--libffi/src/raw_api.c10
-rw-r--r--libffi/src/riscv/ffi.c16
-rw-r--r--libffi/src/sparc/ffi.c6
-rw-r--r--libffi/src/sparc/ffi64.c18
-rw-r--r--libffi/src/tramp.c729
-rw-r--r--libffi/src/types.c4
-rw-r--r--libffi/src/vax/ffi.c4
-rw-r--r--libffi/src/x86/asmnames.h30
-rw-r--r--libffi/src/x86/darwin.S444
-rw-r--r--libffi/src/x86/darwin64.S416
-rw-r--r--libffi/src/x86/darwin64_c.c643
-rw-r--r--libffi/src/x86/darwin_c.c843
-rw-r--r--libffi/src/x86/ffi.c162
-rw-r--r--libffi/src/x86/ffi64.c164
-rw-r--r--libffi/src/x86/ffitarget.h42
-rw-r--r--libffi/src/x86/ffiw64.c114
-rw-r--r--libffi/src/x86/internal.h14
-rw-r--r--libffi/src/x86/internal64.h14
-rw-r--r--libffi/src/x86/sysv.S215
-rw-r--r--libffi/src/x86/sysv_intel.S995
-rw-r--r--libffi/src/x86/unix64.S204
-rw-r--r--libffi/src/x86/win64.S170
-rw-r--r--libffi/src/x86/win64_intel.S238
-rw-r--r--libffi/src/xtensa/ffi.c4
-rw-r--r--libffi/src/xtensa/sysv.S7
80 files changed, 7466 insertions, 3912 deletions
diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index f79602b..5c85fcd 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -19,12 +19,18 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+#if defined(__aarch64__) || defined(__arm64__)|| defined (_M_ARM64)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
+#include <fficonfig.h>
#include <ffi.h>
#include <ffi_common.h>
#include "internal.h"
+#ifdef _WIN32
+#include <windows.h> /* FlushInstructionCache */
+#endif
+#include <tramp.h>
/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
all further uses in this file will refer to the 128-bit type. */
@@ -54,6 +60,17 @@ struct call_context
UINT64 x[N_X_ARG_REG];
};
+#if FFI_EXEC_TRAMPOLINE_TABLE
+
+#ifdef __MACH__
+#ifdef HAVE_PTRAUTH
+#include <ptrauth.h>
+#endif
+#include <mach/vm_param.h>
+#endif
+
+#else
+
#if defined (__clang__) && defined (__APPLE__)
extern void sys_icache_invalidate (void *start, size_t len);
#endif
@@ -65,11 +82,15 @@ ffi_clear_cache (void *start, void *end)
sys_icache_invalidate (start, (char *)end - (char *)start);
#elif defined (__GNUC__)
__builtin___clear_cache (start, end);
+#elif defined (_WIN32)
+ FlushInstructionCache(GetCurrentProcess(), start, (char*)end - (char*)start);
#else
#error "Missing builtin to flush instruction cache"
#endif
}
+#endif
+
/* A subroutine of is_vfp_type. Given a structure type, return the type code
of the first non-structure element. Recurse for structure elements.
Return -1 if the structure is in fact empty, i.e. no nested elements. */
@@ -220,7 +241,7 @@ is_vfp_type (const ffi_type *ty)
/* All tests succeeded. Encode the result. */
done:
- return candidate * 4 + (4 - ele_count);
+ return candidate * 4 + (4 - (int)ele_count);
}
/* Representation of the procedure call argument marshalling
@@ -269,7 +290,7 @@ allocate_to_stack (struct arg_state *state, void *stack,
alignment = 8;
#endif
- nsaa = ALIGN (nsaa, alignment);
+ nsaa = FFI_ALIGN (nsaa, alignment);
state->nsaa = nsaa + size;
return (char *)stack + nsaa;
@@ -304,10 +325,13 @@ extend_integer_type (void *source, int type)
}
}
+#if defined(_MSC_VER)
+void extend_hfa_type (void *dest, void *src, int h);
+#else
static void
extend_hfa_type (void *dest, void *src, int h)
{
- int f = h - AARCH64_RET_S4;
+ ssize_t f = h - AARCH64_RET_S4;
void *x0;
asm volatile (
@@ -339,10 +363,10 @@ extend_hfa_type (void *dest, void *src, int h)
" b 1f\n"
" nop\n"
" ldp q16, q17, [%3]\n" /* Q4 */
-" ldp q18, q19, [%3, #16]\n"
+" ldp q18, q19, [%3, #32]\n"
" b 4f\n"
" ldp q16, q17, [%3]\n" /* Q3 */
-" ldr q18, [%3, #16]\n"
+" ldr q18, [%3, #32]\n"
" b 3f\n"
" ldp q16, q17, [%3]\n" /* Q2 */
" b 2f\n"
@@ -357,7 +381,11 @@ extend_hfa_type (void *dest, void *src, int h)
: "r"(f * 12), "r"(dest), "r"(src)
: "memory", "v16", "v17", "v18", "v19");
}
+#endif
+#if defined(_MSC_VER)
+void* compress_hfa_type (void *dest, void *src, int h);
+#else
static void *
compress_hfa_type (void *dest, void *reg, int h)
{
@@ -426,6 +454,7 @@ compress_hfa_type (void *dest, void *reg, int h)
}
return dest;
}
+#endif
/* Either allocate an appropriate register for the argument type, or if
none are available, allocate a stack slot and return a pointer
@@ -443,7 +472,7 @@ allocate_int_to_reg_or_stack (struct call_context *context,
return allocate_to_stack (state, stack, size, size);
}
-ffi_status
+ffi_status FFI_HIDDEN
ffi_prep_cif_machdep (ffi_cif *cif)
{
ffi_type *rtype = cif->rtype;
@@ -517,7 +546,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
}
/* Round the stack up to a multiple of the stack alignment requirement. */
- cif->bytes = ALIGN(bytes, 16);
+ cif->bytes = (unsigned) FFI_ALIGN(bytes, 16);
cif->flags = flags;
#if defined (__APPLE__)
cif->aarch64_nfixedargs = 0;
@@ -528,14 +557,22 @@ ffi_prep_cif_machdep (ffi_cif *cif)
#if defined (__APPLE__)
/* Perform Apple-specific cif processing for variadic calls */
-ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
- unsigned int nfixedargs,
- unsigned int ntotalargs)
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep_var(ffi_cif *cif, unsigned int nfixedargs,
+ unsigned int ntotalargs)
{
ffi_status status = ffi_prep_cif_machdep (cif);
cif->aarch64_nfixedargs = nfixedargs;
return status;
}
+#else
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep_var(ffi_cif *cif, unsigned int nfixedargs, unsigned int ntotalargs)
+{
+ ffi_status status = ffi_prep_cif_machdep (cif);
+ cif->flags |= AARCH64_FLAG_VARARG;
+ return status;
+}
#endif /* __APPLE__ */
extern void ffi_call_SYSV (struct call_context *context, void *frame,
@@ -552,7 +589,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
void *stack, *frame, *rvalue;
struct arg_state state;
size_t stack_bytes, rtype_size, rsize;
- int i, nargs, flags;
+ int i, nargs, flags, isvariadic = 0;
ffi_type *rtype;
flags = cif->flags;
@@ -560,6 +597,12 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
rtype_size = rtype->size;
stack_bytes = cif->bytes;
+ if (flags & AARCH64_FLAG_VARARG)
+ {
+ isvariadic = 1;
+ flags &= ~AARCH64_FLAG_VARARG;
+ }
+
/* If the target function returns a structure via hidden pointer,
then we cannot allow a null rvalue. Otherwise, mash a null
rvalue to void return type. */
@@ -574,11 +617,12 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
else if (flags & AARCH64_RET_NEED_COPY)
rsize = 16;
- /* Allocate consectutive stack for everything we'll need. */
- context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
+ /* Allocate consectutive stack for everything we'll need.
+ The frame uses 40 bytes for: lr, fp, rvalue, flags, sp */
+ context = alloca (sizeof(struct call_context) + stack_bytes + 40 + rsize);
stack = context + 1;
- frame = stack + stack_bytes;
- rvalue = (rsize ? frame + 32 : orig_rvalue);
+ frame = (void*)((uintptr_t)stack + (uintptr_t)stack_bytes);
+ rvalue = (rsize ? (void*)((uintptr_t)frame + 40) : orig_rvalue);
arg_init (&state);
for (i = 0, nargs = cif->nargs; i < nargs; i++)
@@ -639,16 +683,31 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
h = is_vfp_type (ty);
if (h)
{
- int elems = 4 - (h & 3);
- if (state.nsrn + elems <= N_V_ARG_REG)
- {
- dest = &context->v[state.nsrn];
- state.nsrn += elems;
- extend_hfa_type (dest, a, h);
- break;
- }
- state.nsrn = N_V_ARG_REG;
- dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ int elems = 4 - (h & 3);
+ if (cif->abi == FFI_WIN64 && isvariadic)
+ {
+ if (state.ngrn + elems <= N_X_ARG_REG)
+ {
+ dest = &context->x[state.ngrn];
+ state.ngrn += elems;
+ extend_hfa_type(dest, a, h);
+ break;
+ }
+ state.nsrn = N_X_ARG_REG;
+ dest = allocate_to_stack(&state, stack, ty->alignment, s);
+ }
+ else
+ {
+ if (state.nsrn + elems <= N_V_ARG_REG)
+ {
+ dest = &context->v[state.nsrn];
+ state.nsrn += elems;
+ extend_hfa_type (dest, a, h);
+ break;
+ }
+ state.nsrn = N_V_ARG_REG;
+ dest = allocate_to_stack (&state, stack, ty->alignment, s);
+ }
}
else if (s > 16)
{
@@ -657,6 +716,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
the argument is replaced by a pointer to the copy. */
a = &avalue[i];
t = FFI_TYPE_POINTER;
+ s = sizeof (void *);
goto do_pointer;
}
else
@@ -669,7 +729,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
X registers, then the argument is copied into
consecutive X registers. */
dest = &context->x[state.ngrn];
- state.ngrn += n;
+ state.ngrn += (unsigned int)n;
}
else
{
@@ -711,6 +771,8 @@ ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
+#if FFI_CLOSURES
+
#ifdef FFI_GO_CLOSURES
void
ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
@@ -724,239 +786,9 @@ ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
extern void ffi_closure_SYSV (void) FFI_HIDDEN;
extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
-
-#if FFI_EXEC_TRAMPOLINE_TABLE
-
-#include <mach/mach.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-extern void *ffi_closure_trampoline_table_page;
-
-typedef struct ffi_trampoline_table ffi_trampoline_table;
-typedef struct ffi_trampoline_table_entry ffi_trampoline_table_entry;
-
-struct ffi_trampoline_table
-{
- /* contiguous writable and executable pages */
- vm_address_t config_page;
- vm_address_t trampoline_page;
-
- /* free list tracking */
- uint16_t free_count;
- ffi_trampoline_table_entry *free_list;
- ffi_trampoline_table_entry *free_list_pool;
-
- ffi_trampoline_table *prev;
- ffi_trampoline_table *next;
-};
-
-struct ffi_trampoline_table_entry
-{
- void *(*trampoline) ();
- ffi_trampoline_table_entry *next;
-};
-
-/* The trampoline configuration is placed a page prior to the trampoline's entry point */
-#define FFI_TRAMPOLINE_CODELOC_CONFIG(codeloc) ((void **) (((uint8_t *) codeloc) - PAGE_SIZE));
-
-/* Total number of trampolines that fit in one trampoline table */
-#define FFI_TRAMPOLINE_COUNT (PAGE_SIZE / FFI_TRAMPOLINE_SIZE)
-
-static pthread_mutex_t ffi_trampoline_lock = PTHREAD_MUTEX_INITIALIZER;
-static ffi_trampoline_table *ffi_trampoline_tables = NULL;
-
-static ffi_trampoline_table *
-ffi_trampoline_table_alloc ()
-{
- ffi_trampoline_table *table = NULL;
-
- /* Loop until we can allocate two contiguous pages */
- while (table == NULL)
- {
- vm_address_t config_page = 0x0;
- kern_return_t kt;
-
- /* Try to allocate two pages */
- kt =
- vm_allocate (mach_task_self (), &config_page, PAGE_SIZE * 2,
- VM_FLAGS_ANYWHERE);
- if (kt != KERN_SUCCESS)
- {
- fprintf (stderr, "vm_allocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- break;
- }
-
- /* Now drop the second half of the allocation to make room for the trampoline table */
- vm_address_t trampoline_page = config_page + PAGE_SIZE;
- kt = vm_deallocate (mach_task_self (), trampoline_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- {
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- break;
- }
-
- /* Remap the trampoline table to directly follow the config page */
- vm_prot_t cur_prot;
- vm_prot_t max_prot;
-
- kt =
- vm_remap (mach_task_self (), &trampoline_page, PAGE_SIZE, 0x0, FALSE,
- mach_task_self (),
- (vm_address_t) & ffi_closure_trampoline_table_page, FALSE,
- &cur_prot, &max_prot, VM_INHERIT_SHARE);
-
- /* If we lost access to the destination trampoline page, drop our config allocation mapping and retry */
- if (kt != KERN_SUCCESS)
- {
- /* Log unexpected failures */
- if (kt != KERN_NO_SPACE)
- {
- fprintf (stderr, "vm_remap() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- }
-
- vm_deallocate (mach_task_self (), config_page, PAGE_SIZE);
- continue;
- }
-
- /* We have valid trampoline and config pages */
- table = calloc (1, sizeof (ffi_trampoline_table));
- table->free_count = FFI_TRAMPOLINE_COUNT;
- table->config_page = config_page;
- table->trampoline_page = trampoline_page;
-
- /* Create and initialize the free list */
- table->free_list_pool =
- calloc (FFI_TRAMPOLINE_COUNT, sizeof (ffi_trampoline_table_entry));
-
- uint16_t i;
- for (i = 0; i < table->free_count; i++)
- {
- ffi_trampoline_table_entry *entry = &table->free_list_pool[i];
- entry->trampoline =
- (void *) (table->trampoline_page + (i * FFI_TRAMPOLINE_SIZE));
-
- if (i < table->free_count - 1)
- entry->next = &table->free_list_pool[i + 1];
- }
-
- table->free_list = table->free_list_pool;
- }
-
- return table;
-}
-
-void *
-ffi_closure_alloc (size_t size, void **code)
-{
- /* Create the closure */
- ffi_closure *closure = malloc (size);
- if (closure == NULL)
- return NULL;
-
- pthread_mutex_lock (&ffi_trampoline_lock);
-
- /* Check for an active trampoline table with available entries. */
- ffi_trampoline_table *table = ffi_trampoline_tables;
- if (table == NULL || table->free_list == NULL)
- {
- table = ffi_trampoline_table_alloc ();
- if (table == NULL)
- {
- free (closure);
- return NULL;
- }
-
- /* Insert the new table at the top of the list */
- table->next = ffi_trampoline_tables;
- if (table->next != NULL)
- table->next->prev = table;
-
- ffi_trampoline_tables = table;
- }
-
- /* Claim the free entry */
- ffi_trampoline_table_entry *entry = ffi_trampoline_tables->free_list;
- ffi_trampoline_tables->free_list = entry->next;
- ffi_trampoline_tables->free_count--;
- entry->next = NULL;
-
- pthread_mutex_unlock (&ffi_trampoline_lock);
-
- /* Initialize the return values */
- *code = entry->trampoline;
- closure->trampoline_table = table;
- closure->trampoline_table_entry = entry;
-
- return closure;
-}
-
-void
-ffi_closure_free (void *ptr)
-{
- ffi_closure *closure = ptr;
-
- pthread_mutex_lock (&ffi_trampoline_lock);
-
- /* Fetch the table and entry references */
- ffi_trampoline_table *table = closure->trampoline_table;
- ffi_trampoline_table_entry *entry = closure->trampoline_table_entry;
-
- /* Return the entry to the free list */
- entry->next = table->free_list;
- table->free_list = entry;
- table->free_count++;
-
- /* If all trampolines within this table are free, and at least one other table exists, deallocate
- * the table */
- if (table->free_count == FFI_TRAMPOLINE_COUNT
- && ffi_trampoline_tables != table)
- {
- /* Remove from the list */
- if (table->prev != NULL)
- table->prev->next = table->next;
-
- if (table->next != NULL)
- table->next->prev = table->prev;
-
- /* Deallocate pages */
- kern_return_t kt;
- kt = vm_deallocate (mach_task_self (), table->config_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
-
- kt =
- vm_deallocate (mach_task_self (), table->trampoline_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
-
- /* Deallocate free list */
- free (table->free_list_pool);
- free (table);
- }
- else if (ffi_trampoline_tables != table)
- {
- /* Otherwise, bump this table to the top of the list */
- table->prev = NULL;
- table->next = ffi_trampoline_tables;
- if (ffi_trampoline_tables != NULL)
- ffi_trampoline_tables->prev = table;
-
- ffi_trampoline_tables = table;
- }
-
- pthread_mutex_unlock (&ffi_trampoline_lock);
-
- /* Free the closure */
- free (closure);
-}
-
+#if defined(FFI_EXEC_STATIC_TRAMP)
+extern void ffi_closure_SYSV_alt (void) FFI_HIDDEN;
+extern void ffi_closure_SYSV_V_alt (void) FFI_HIDDEN;
#endif
ffi_status
@@ -966,7 +798,7 @@ ffi_prep_closure_loc (ffi_closure *closure,
void *user_data,
void *codeloc)
{
- if (cif->abi != FFI_SYSV)
+ if (cif->abi != FFI_SYSV && cif->abi != FFI_WIN64)
return FFI_BAD_ABI;
void (*start)(void);
@@ -977,9 +809,14 @@ ffi_prep_closure_loc (ffi_closure *closure,
start = ffi_closure_SYSV;
#if FFI_EXEC_TRAMPOLINE_TABLE
- void **config = FFI_TRAMPOLINE_CODELOC_CONFIG (codeloc);
+#ifdef __MACH__
+#ifdef HAVE_PTRAUTH
+ codeloc = ptrauth_auth_data(codeloc, ptrauth_key_function_pointer, 0);
+#endif
+ void **config = (void **)((uint8_t *)codeloc - PAGE_MAX_SIZE);
config[0] = closure;
config[1] = start;
+#endif
#else
static const unsigned char trampoline[16] = {
0x90, 0x00, 0x00, 0x58, /* ldr x16, tramp+16 */
@@ -987,12 +824,37 @@ ffi_prep_closure_loc (ffi_closure *closure,
0x00, 0x02, 0x1f, 0xd6 /* br x16 */
};
char *tramp = closure->tramp;
-
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ if (ffi_tramp_is_present(closure))
+ {
+ /* Initialize the static trampoline's parameters. */
+ if (start == ffi_closure_SYSV_V)
+ start = ffi_closure_SYSV_V_alt;
+ else
+ start = ffi_closure_SYSV_alt;
+ ffi_tramp_set_parms (closure->ftramp, start, closure);
+ goto out;
+ }
+#endif
+
+ /* Initialize the dynamic trampoline. */
memcpy (tramp, trampoline, sizeof(trampoline));
*(UINT64 *)(tramp + 16) = (uintptr_t)start;
ffi_clear_cache(tramp, tramp + FFI_TRAMPOLINE_SIZE);
+
+ /* Also flush the cache for code mapping. */
+#ifdef _WIN32
+ // Not using dlmalloc.c for Windows ARM64 builds
+ // so calling ffi_data_to_code_pointer() isn't necessary
+ unsigned char *tramp_code = tramp;
+ #else
+ unsigned char *tramp_code = ffi_data_to_code_pointer (tramp);
+ #endif
+ ffi_clear_cache (tramp_code, tramp_code + FFI_TRAMPOLINE_SIZE);
+out:
#endif
closure->cif = cif;
@@ -1012,7 +874,7 @@ ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif* cif,
{
void (*start)(void);
- if (cif->abi != FFI_SYSV)
+ if (cif->abi != FFI_SYSV && cif->abi != FFI_WIN64)
return FFI_BAD_ABI;
if (cif->flags & AARCH64_FLAG_ARG_V)
@@ -1052,11 +914,18 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
void *stack, void *rvalue, void *struct_rvalue)
{
void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
- int i, h, nargs, flags;
+ int i, h, nargs, flags, isvariadic = 0;
struct arg_state state;
arg_init (&state);
+ flags = cif->flags;
+ if (flags & AARCH64_FLAG_VARARG)
+ {
+ isvariadic = 1;
+ flags &= ~AARCH64_FLAG_VARARG;
+ }
+
for (i = 0, nargs = cif->nargs; i < nargs; i++)
{
ffi_type *ty = cif->arg_types[i];
@@ -1091,58 +960,85 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
if (h)
{
n = 4 - (h & 3);
- if (state.nsrn + n <= N_V_ARG_REG)
- {
- void *reg = &context->v[state.nsrn];
- state.nsrn += n;
-
- /* Eeek! We need a pointer to the structure, however the
- homogeneous float elements are being passed in individual
- registers, therefore for float and double the structure
- is not represented as a contiguous sequence of bytes in
- our saved register context. We don't need the original
- contents of the register storage, so we reformat the
- structure into the same memory. */
- avalue[i] = compress_hfa_type (reg, reg, h);
- }
- else
- {
- state.nsrn = N_V_ARG_REG;
- avalue[i] = allocate_to_stack (&state, stack,
- ty->alignment, s);
- }
- }
- else if (s > 16)
- {
- /* Replace Composite type of size greater than 16 with a
- pointer. */
- avalue[i] = *(void **)
- allocate_int_to_reg_or_stack (context, &state, stack,
- sizeof (void *));
- }
- else
- {
- n = (s + 7) / 8;
- if (state.ngrn + n <= N_X_ARG_REG)
- {
- avalue[i] = &context->x[state.ngrn];
- state.ngrn += n;
- }
- else
- {
- state.ngrn = N_X_ARG_REG;
- avalue[i] = allocate_to_stack (&state, stack,
- ty->alignment, s);
- }
- }
- break;
+ if (cif->abi == FFI_WIN64 && isvariadic)
+ {
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ void *reg = &context->x[state.ngrn];
+ state.ngrn += (unsigned int)n;
+
+ /* Eeek! We need a pointer to the structure, however the
+ homogeneous float elements are being passed in individual
+ registers, therefore for float and double the structure
+ is not represented as a contiguous sequence of bytes in
+ our saved register context. We don't need the original
+ contents of the register storage, so we reformat the
+ structure into the same memory. */
+ avalue[i] = compress_hfa_type(reg, reg, h);
+ }
+ else
+ {
+ state.ngrn = N_X_ARG_REG;
+ state.nsrn = N_V_ARG_REG;
+ avalue[i] = allocate_to_stack(&state, stack,
+ ty->alignment, s);
+ }
+ }
+ else
+ {
+ if (state.nsrn + n <= N_V_ARG_REG)
+ {
+ void *reg = &context->v[state.nsrn];
+ state.nsrn += (unsigned int)n;
+ avalue[i] = compress_hfa_type(reg, reg, h);
+ }
+ else
+ {
+ state.nsrn = N_V_ARG_REG;
+ avalue[i] = allocate_to_stack(&state, stack,
+ ty->alignment, s);
+ }
+ }
+ }
+ else if (s > 16)
+ {
+ /* Replace Composite type of size greater than 16 with a
+ pointer. */
+ avalue[i] = *(void **)
+ allocate_int_to_reg_or_stack (context, &state, stack,
+ sizeof (void *));
+ }
+ else
+ {
+ n = (s + 7) / 8;
+ if (state.ngrn + n <= N_X_ARG_REG)
+ {
+ avalue[i] = &context->x[state.ngrn];
+ state.ngrn += (unsigned int)n;
+ }
+ else
+ {
+ state.ngrn = N_X_ARG_REG;
+ avalue[i] = allocate_to_stack(&state, stack,
+ ty->alignment, s);
+ }
+ }
+ break;
+
+ default:
+ abort();
+ }
- default:
- abort();
+#if defined (__APPLE__)
+ if (i + 1 == cif->aarch64_nfixedargs)
+ {
+ state.ngrn = N_X_ARG_REG;
+ state.nsrn = N_V_ARG_REG;
+ state.allocating_variadic = 1;
}
+#endif
}
- flags = cif->flags;
if (flags & AARCH64_RET_IN_MEM)
rvalue = struct_rvalue;
@@ -1150,3 +1046,19 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
return flags;
}
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void *
+ffi_tramp_arch (size_t *tramp_size, size_t *map_size)
+{
+ extern void *trampoline_code_table;
+
+ *tramp_size = AARCH64_TRAMP_SIZE;
+ *map_size = AARCH64_TRAMP_MAP_SIZE;
+ return &trampoline_code_table;
+}
+#endif
+
+#endif /* FFI_CLOSURES */
+
+#endif /* (__aarch64__) || defined(__arm64__)|| defined (_M_ARM64)*/
diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h
index 34200ad..d5622e1 100644
--- a/libffi/src/aarch64/ffitarget.h
+++ b/libffi/src/aarch64/ffitarget.h
@@ -32,6 +32,10 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#define FFI_SIZEOF_JAVA_RAW 4
typedef unsigned long long ffi_arg;
typedef signed long long ffi_sarg;
+#elif defined(_WIN32)
+#define FFI_SIZEOF_ARG 8
+typedef unsigned long long ffi_arg;
+typedef signed long long ffi_sarg;
#else
typedef unsigned long ffi_arg;
typedef signed long ffi_sarg;
@@ -41,34 +45,53 @@ typedef enum ffi_abi
{
FFI_FIRST_ABI = 0,
FFI_SYSV,
+ FFI_WIN64,
FFI_LAST_ABI,
+#if defined(_WIN32)
+ FFI_DEFAULT_ABI = FFI_WIN64
+#else
FFI_DEFAULT_ABI = FFI_SYSV
+#endif
} ffi_abi;
#endif
/* ---- Definitions for closures ----------------------------------------- */
#define FFI_CLOSURES 1
-#if defined (__APPLE__)
-#define FFI_TRAMPOLINE_SIZE 20
+#define FFI_NATIVE_RAW_API 0
+
+#if defined (FFI_EXEC_TRAMPOLINE_TABLE) && FFI_EXEC_TRAMPOLINE_TABLE
+
+#ifdef __MACH__
+#define FFI_TRAMPOLINE_SIZE 16
#define FFI_TRAMPOLINE_CLOSURE_OFFSET 16
#else
+#error "No trampoline table implementation"
+#endif
+
+#else
#define FFI_TRAMPOLINE_SIZE 24
#define FFI_TRAMPOLINE_CLOSURE_OFFSET FFI_TRAMPOLINE_SIZE
#endif
-#define FFI_NATIVE_RAW_API 0
+
+#ifdef _WIN32
+#define FFI_EXTRA_CIF_FIELDS unsigned is_variadic
+#endif
+#define FFI_TARGET_SPECIFIC_VARIADIC
/* ---- Internal ---- */
#if defined (__APPLE__)
-#define FFI_TARGET_SPECIFIC_VARIADIC
#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
-#else
-/* iOS reserves x18 for the system. Disable Go closures until
+#elif !defined(_WIN32)
+/* iOS and Windows reserve x18 for the system. Disable Go closures until
a new static chain is chosen. */
#define FFI_GO_CLOSURES 1
#endif
+#ifndef _WIN32
+/* No complex type on Windows */
#define FFI_TARGET_HAS_COMPLEX_TYPE
+#endif
#endif
diff --git a/libffi/src/aarch64/internal.h b/libffi/src/aarch64/internal.h
index 9c3e077..b5d102b 100644
--- a/libffi/src/aarch64/internal.h
+++ b/libffi/src/aarch64/internal.h
@@ -61,7 +61,40 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#define AARCH64_FLAG_ARG_V_BIT 7
#define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT)
+#define AARCH64_FLAG_VARARG (1 << 8)
#define N_X_ARG_REG 8
#define N_V_ARG_REG 8
#define CALL_CONTEXT_SIZE (N_V_ARG_REG * 16 + N_X_ARG_REG * 8)
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+/*
+ * For the trampoline code table mapping, a mapping size of 16K is chosen to
+ * cover the base page sizes of 4K and 16K.
+ */
+#define AARCH64_TRAMP_MAP_SHIFT 14
+#define AARCH64_TRAMP_MAP_SIZE (1 << AARCH64_TRAMP_MAP_SHIFT)
+#define AARCH64_TRAMP_SIZE 32
+
+#endif
+
+/* Helpers for writing assembly compatible with arm ptr auth */
+#ifdef LIBFFI_ASM
+
+#ifdef HAVE_PTRAUTH
+#define SIGN_LR pacibsp
+#define SIGN_LR_WITH_REG(x) pacib lr, x
+#define AUTH_LR_AND_RET retab
+#define AUTH_LR_WITH_REG(x) autib lr, x
+#define BRANCH_AND_LINK_TO_REG blraaz
+#define BRANCH_TO_REG braaz
+#else
+#define SIGN_LR
+#define SIGN_LR_WITH_REG(x)
+#define AUTH_LR_AND_RET ret
+#define AUTH_LR_WITH_REG(x)
+#define BRANCH_AND_LINK_TO_REG blr
+#define BRANCH_TO_REG br
+#endif
+
+#endif
diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S
index c1bf9b9..eeaf3f8 100644
--- a/libffi/src/aarch64/sysv.S
+++ b/libffi/src/aarch64/sysv.S
@@ -19,6 +19,7 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+#if defined(__aarch64__) || defined(__arm64__)
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
@@ -77,9 +78,22 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
cfi_startproc
CNAME(ffi_call_SYSV):
+ /* Sign the lr with x1 since that is where it will be stored */
+ SIGN_LR_WITH_REG(x1)
+
/* Use a stack frame allocated by our caller. */
- cfi_def_cfa(x1, 32);
+#if defined(HAVE_PTRAUTH) && defined(__APPLE__)
+ /* darwin's libunwind assumes that the cfa is the sp and that's the data
+ * used to sign the lr. In order to allow unwinding through this
+ * function it is necessary to point the cfa at the signing register.
+ */
+ cfi_def_cfa(x1, 0);
+#else
+ cfi_def_cfa(x1, 40);
+#endif
stp x29, x30, [x1]
+ mov x9, sp
+ str x9, [x1, #32]
mov x29, x1
mov sp, x0
cfi_def_cfa_register(x29)
@@ -110,13 +124,15 @@ CNAME(ffi_call_SYSV):
/* Deallocate the context, leaving the stacked arguments. */
add sp, sp, #CALL_CONTEXT_SIZE
- blr x9 /* call fn */
+ BRANCH_AND_LINK_TO_REG x9 /* call fn */
ldp x3, x4, [x29, #16] /* reload rvalue and flags */
/* Partially deconstruct the stack frame. */
- mov sp, x29
+ ldr x9, [x29, #32]
+ mov sp, x9
cfi_def_cfa_register (sp)
+ mov x2, x29 /* Preserve for auth */
ldp x29, x30, [x29]
/* Save the return value as directed. */
@@ -130,80 +146,87 @@ CNAME(ffi_call_SYSV):
and therefore we want to extend to 64 bits; these types
have two consecutive entries allocated for them. */
.align 4
-0: ret /* VOID */
+0: b 99f /* VOID */
nop
1: str x0, [x3] /* INT64 */
- ret
+ b 99f
2: stp x0, x1, [x3] /* INT128 */
- ret
+ b 99f
3: brk #1000 /* UNUSED */
- ret
+ b 99f
4: brk #1000 /* UNUSED */
- ret
+ b 99f
5: brk #1000 /* UNUSED */
- ret
+ b 99f
6: brk #1000 /* UNUSED */
- ret
+ b 99f
7: brk #1000 /* UNUSED */
- ret
+ b 99f
8: st4 { v0.s, v1.s, v2.s, v3.s }[0], [x3] /* S4 */
- ret
+ b 99f
9: st3 { v0.s, v1.s, v2.s }[0], [x3] /* S3 */
- ret
+ b 99f
10: stp s0, s1, [x3] /* S2 */
- ret
+ b 99f
11: str s0, [x3] /* S1 */
- ret
+ b 99f
12: st4 { v0.d, v1.d, v2.d, v3.d }[0], [x3] /* D4 */
- ret
+ b 99f
13: st3 { v0.d, v1.d, v2.d }[0], [x3] /* D3 */
- ret
+ b 99f
14: stp d0, d1, [x3] /* D2 */
- ret
+ b 99f
15: str d0, [x3] /* D1 */
- ret
+ b 99f
16: str q3, [x3, #48] /* Q4 */
nop
17: str q2, [x3, #32] /* Q3 */
nop
18: stp q0, q1, [x3] /* Q2 */
- ret
+ b 99f
19: str q0, [x3] /* Q1 */
- ret
+ b 99f
20: uxtb w0, w0 /* UINT8 */
str x0, [x3]
-21: ret /* reserved */
+21: b 99f /* reserved */
nop
22: uxth w0, w0 /* UINT16 */
str x0, [x3]
-23: ret /* reserved */
+23: b 99f /* reserved */
nop
24: mov w0, w0 /* UINT32 */
str x0, [x3]
-25: ret /* reserved */
+25: b 99f /* reserved */
nop
26: sxtb x0, w0 /* SINT8 */
str x0, [x3]
-27: ret /* reserved */
+27: b 99f /* reserved */
nop
28: sxth x0, w0 /* SINT16 */
str x0, [x3]
-29: ret /* reserved */
+29: b 99f /* reserved */
nop
30: sxtw x0, w0 /* SINT32 */
str x0, [x3]
-31: ret /* reserved */
+31: b 99f /* reserved */
nop
+ /* Return now that result has been populated. */
+99:
+ AUTH_LR_WITH_REG(x2)
+ ret
+
cfi_endproc
.globl CNAME(ffi_call_SYSV)
+ FFI_HIDDEN(CNAME(ffi_call_SYSV))
#ifdef __ELF__
.type CNAME(ffi_call_SYSV), #function
- .hidden CNAME(ffi_call_SYSV)
.size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
#endif
+#if FFI_CLOSURES
+
/* ffi_closure_SYSV
Closure invocation glue. This is the low level code invoked directly by
@@ -223,6 +246,7 @@ CNAME(ffi_call_SYSV):
.align 4
CNAME(ffi_closure_SYSV_V):
cfi_startproc
+ SIGN_LR
stp x29, x30, [sp, #-ffi_closure_SYSV_FS]!
cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
cfi_rel_offset (x29, 0)
@@ -237,15 +261,16 @@ CNAME(ffi_closure_SYSV_V):
cfi_endproc
.globl CNAME(ffi_closure_SYSV_V)
+ FFI_HIDDEN(CNAME(ffi_closure_SYSV_V))
#ifdef __ELF__
.type CNAME(ffi_closure_SYSV_V), #function
- .hidden CNAME(ffi_closure_SYSV_V)
.size CNAME(ffi_closure_SYSV_V), . - CNAME(ffi_closure_SYSV_V)
#endif
.align 4
cfi_startproc
CNAME(ffi_closure_SYSV):
+ SIGN_LR
stp x29, x30, [sp, #-ffi_closure_SYSV_FS]!
cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
cfi_rel_offset (x29, 0)
@@ -262,7 +287,9 @@ CNAME(ffi_closure_SYSV):
/* Load ffi_closure_inner arguments. */
ldp PTR_REG(0), PTR_REG(1), [x17, #FFI_TRAMPOLINE_CLOSURE_OFFSET] /* load cif, fn */
ldr PTR_REG(2), [x17, #FFI_TRAMPOLINE_CLOSURE_OFFSET+PTR_SIZE*2] /* load user_data */
+#ifdef FFI_GO_CLOSURES
.Ldo_closure:
+#endif
add x3, sp, #16 /* load context */
add x4, sp, #ffi_closure_SYSV_FS /* load stack */
add x5, sp, #16+CALL_CONTEXT_SIZE /* load rvalue */
@@ -296,7 +323,7 @@ CNAME(ffi_closure_SYSV):
nop
8: ldr s3, [x3, #12] /* S4 */
nop
-9: ldr s2, [x2, #8] /* S3 */
+9: ldr s2, [x3, #8] /* S3 */
nop
10: ldp s0, s1, [x3] /* S2 */
b 99f
@@ -345,35 +372,109 @@ CNAME(ffi_closure_SYSV):
cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS)
cfi_restore (x29)
cfi_restore (x30)
- ret
+ AUTH_LR_AND_RET
cfi_endproc
.globl CNAME(ffi_closure_SYSV)
+ FFI_HIDDEN(CNAME(ffi_closure_SYSV))
#ifdef __ELF__
.type CNAME(ffi_closure_SYSV), #function
- .hidden CNAME(ffi_closure_SYSV)
.size CNAME(ffi_closure_SYSV), . - CNAME(ffi_closure_SYSV)
#endif
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ .align 4
+CNAME(ffi_closure_SYSV_V_alt):
+ /* See the comments above trampoline_code_table. */
+ ldr x17, [sp, #8] /* Load closure in x17 */
+ add sp, sp, #16 /* Restore the stack */
+ b CNAME(ffi_closure_SYSV_V)
+
+ .globl CNAME(ffi_closure_SYSV_V_alt)
+ FFI_HIDDEN(CNAME(ffi_closure_SYSV_V_alt))
+#ifdef __ELF__
+ .type CNAME(ffi_closure_SYSV_V_alt), #function
+ .size CNAME(ffi_closure_SYSV_V_alt), . - CNAME(ffi_closure_SYSV_V_alt)
+#endif
+
+ .align 4
+CNAME(ffi_closure_SYSV_alt):
+ /* See the comments above trampoline_code_table. */
+ ldr x17, [sp, #8] /* Load closure in x17 */
+ add sp, sp, #16 /* Restore the stack */
+ b CNAME(ffi_closure_SYSV)
+
+ .globl CNAME(ffi_closure_SYSV_alt)
+ FFI_HIDDEN(CNAME(ffi_closure_SYSV_alt))
+#ifdef __ELF__
+ .type CNAME(ffi_closure_SYSV_alt), #function
+ .size CNAME(ffi_closure_SYSV_alt), . - CNAME(ffi_closure_SYSV_alt)
+#endif
+
+/*
+ * Below is the definition of the trampoline code table. Each element in
+ * the code table is a trampoline.
+ */
+/*
+ * The trampoline uses register x17. It saves the original value of x17 on
+ * the stack.
+ *
+ * The trampoline has two parameters - target code to jump to and data for
+ * the target code. The trampoline extracts the parameters from its parameter
+ * block (see tramp_table_map()). The trampoline saves the data address on
+ * the stack. Finally, it jumps to the target code.
+ *
+ * The target code can choose to:
+ *
+ * - restore the value of x17
+ * - load the data address in a register
+ * - restore the stack pointer to what it was when the trampoline was invoked.
+ */
+ .align AARCH64_TRAMP_MAP_SHIFT
+CNAME(trampoline_code_table):
+ .rept AARCH64_TRAMP_MAP_SIZE / AARCH64_TRAMP_SIZE
+ sub sp, sp, #16 /* Make space on the stack */
+ str x17, [sp] /* Save x17 on stack */
+ adr x17, #16376 /* Get data address */
+ ldr x17, [x17] /* Copy data into x17 */
+ str x17, [sp, #8] /* Save data on stack */
+ adr x17, #16372 /* Get code address */
+ ldr x17, [x17] /* Load code address into x17 */
+ br x17 /* Jump to code */
+ .endr
+
+ .globl CNAME(trampoline_code_table)
+ FFI_HIDDEN(CNAME(trampoline_code_table))
+#ifdef __ELF__
+ .type CNAME(trampoline_code_table), #function
+ .size CNAME(trampoline_code_table), . - CNAME(trampoline_code_table)
+#endif
+ .align AARCH64_TRAMP_MAP_SHIFT
+#endif /* FFI_EXEC_STATIC_TRAMP */
+
#if FFI_EXEC_TRAMPOLINE_TABLE
- .align 12
+
+#ifdef __MACH__
+#include <mach/machine/vm_param.h>
+ .align PAGE_MAX_SHIFT
CNAME(ffi_closure_trampoline_table_page):
- .rept 16384 / FFI_TRAMPOLINE_SIZE
- adr x17, -16384
- adr x16, -16380
- ldr x16, [x16]
- ldr x17, [x17]
- br x16
+ .rept PAGE_MAX_SIZE / FFI_TRAMPOLINE_SIZE
+ adr x16, -PAGE_MAX_SIZE
+ ldp x17, x16, [x16]
+ br x16
+ nop /* each entry in the trampoline config page is 2*sizeof(void*) so the trampoline itself cannot be smaller than 16 bytes */
.endr
-
+
.globl CNAME(ffi_closure_trampoline_table_page)
+ FFI_HIDDEN(CNAME(ffi_closure_trampoline_table_page))
#ifdef __ELF__
.type CNAME(ffi_closure_trampoline_table_page), #function
- .hidden CNAME(ffi_closure_trampoline_table_page)
.size CNAME(ffi_closure_trampoline_table_page), . - CNAME(ffi_closure_trampoline_table_page)
#endif
#endif
+#endif /* FFI_EXEC_TRAMPOLINE_TABLE */
+
#ifdef FFI_GO_CLOSURES
.align 4
CNAME(ffi_go_closure_SYSV_V):
@@ -392,9 +493,9 @@ CNAME(ffi_go_closure_SYSV_V):
cfi_endproc
.globl CNAME(ffi_go_closure_SYSV_V)
+ FFI_HIDDEN(CNAME(ffi_go_closure_SYSV_V))
#ifdef __ELF__
.type CNAME(ffi_go_closure_SYSV_V), #function
- .hidden CNAME(ffi_go_closure_SYSV_V)
.size CNAME(ffi_go_closure_SYSV_V), . - CNAME(ffi_go_closure_SYSV_V)
#endif
@@ -421,12 +522,14 @@ CNAME(ffi_go_closure_SYSV):
cfi_endproc
.globl CNAME(ffi_go_closure_SYSV)
+ FFI_HIDDEN(CNAME(ffi_go_closure_SYSV))
#ifdef __ELF__
.type CNAME(ffi_go_closure_SYSV), #function
- .hidden CNAME(ffi_go_closure_SYSV)
.size CNAME(ffi_go_closure_SYSV), . - CNAME(ffi_go_closure_SYSV)
#endif
#endif /* FFI_GO_CLOSURES */
+#endif /* FFI_CLOSURES */
+#endif /* __arm64__ */
#if defined __ELF__ && defined __linux__
.section .note.GNU-stack,"",%progbits
diff --git a/libffi/src/aarch64/win64_armasm.S b/libffi/src/aarch64/win64_armasm.S
new file mode 100644
index 0000000..7fc185b
--- /dev/null
+++ b/libffi/src/aarch64/win64_armasm.S
@@ -0,0 +1,506 @@
+/* Copyright (c) 2009, 2010, 2011, 2012 ARM Ltd.
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+#include <ffi_cfi.h>
+#include "internal.h"
+
+ OPT 2 /*disable listing */
+/* For some macros to add unwind information */
+#include "ksarm64.h"
+ OPT 1 /*re-enable listing */
+
+#define BE(X) 0
+#define PTR_REG(n) x##n
+#define PTR_SIZE 8
+
+ IMPORT ffi_closure_SYSV_inner
+ EXPORT ffi_call_SYSV
+ EXPORT ffi_closure_SYSV_V
+ EXPORT ffi_closure_SYSV
+ EXPORT extend_hfa_type
+ EXPORT compress_hfa_type
+#ifdef FFI_GO_CLOSURES
+ EXPORT ffi_go_closure_SYSV_V
+ EXPORT ffi_go_closure_SYSV
+#endif
+
+ TEXTAREA, ALIGN=8
+
+/* ffi_call_SYSV
+ extern void ffi_call_SYSV (void *stack, void *frame,
+ void (*fn)(void), void *rvalue,
+ int flags, void *closure);
+ Therefore on entry we have:
+ x0 stack
+ x1 frame
+ x2 fn
+ x3 rvalue
+ x4 flags
+ x5 closure
+*/
+
+ NESTED_ENTRY ffi_call_SYSV_fake
+
+ /* For unwind information, Windows has to store fp and lr */
+ PROLOG_SAVE_REG_PAIR x29, x30, #-32!
+
+ ALTERNATE_ENTRY ffi_call_SYSV
+ /* Use a stack frame allocated by our caller. */
+ stp x29, x30, [x1]
+ mov x29, x1
+ mov sp, x0
+
+ mov x9, x2 /* save fn */
+ mov x8, x3 /* install structure return */
+#ifdef FFI_GO_CLOSURES
+ /*mov x18, x5 install static chain */
+#endif
+ stp x3, x4, [x29, #16] /* save rvalue and flags */
+
+ /* Load the vector argument passing registers, if necessary. */
+ tbz x4, #AARCH64_FLAG_ARG_V_BIT, ffi_call_SYSV_L1
+ ldp q0, q1, [sp, #0]
+ ldp q2, q3, [sp, #32]
+ ldp q4, q5, [sp, #64]
+ ldp q6, q7, [sp, #96]
+
+ffi_call_SYSV_L1
+ /* Load the core argument passing registers, including
+ the structure return pointer. */
+ ldp x0, x1, [sp, #16*N_V_ARG_REG + 0]
+ ldp x2, x3, [sp, #16*N_V_ARG_REG + 16]
+ ldp x4, x5, [sp, #16*N_V_ARG_REG + 32]
+ ldp x6, x7, [sp, #16*N_V_ARG_REG + 48]
+
+ /* Deallocate the context, leaving the stacked arguments. */
+ add sp, sp, #CALL_CONTEXT_SIZE
+
+ blr x9 /* call fn */
+
+ ldp x3, x4, [x29, #16] /* reload rvalue and flags */
+
+ /* Partially deconstruct the stack frame. */
+ mov sp, x29
+ ldp x29, x30, [x29]
+
+ /* Save the return value as directed. */
+ adr x5, ffi_call_SYSV_return
+ and w4, w4, #AARCH64_RET_MASK
+ add x5, x5, x4, lsl #3
+ br x5
+
+ /* Note that each table entry is 2 insns, and thus 8 bytes.
+ For integer data, note that we're storing into ffi_arg
+ and therefore we want to extend to 64 bits; these types
+ have two consecutive entries allocated for them. */
+ ALIGN 4
+ffi_call_SYSV_return
+ ret /* VOID */
+ nop
+ str x0, [x3] /* INT64 */
+ ret
+ stp x0, x1, [x3] /* INT128 */
+ ret
+ brk #1000 /* UNUSED */
+ ret
+ brk #1000 /* UNUSED */
+ ret
+ brk #1000 /* UNUSED */
+ ret
+ brk #1000 /* UNUSED */
+ ret
+ brk #1000 /* UNUSED */
+ ret
+ st4 { v0.s, v1.s, v2.s, v3.s }[0], [x3] /* S4 */
+ ret
+ st3 { v0.s, v1.s, v2.s }[0], [x3] /* S3 */
+ ret
+ stp s0, s1, [x3] /* S2 */
+ ret
+ str s0, [x3] /* S1 */
+ ret
+ st4 { v0.d, v1.d, v2.d, v3.d }[0], [x3] /* D4 */
+ ret
+ st3 { v0.d, v1.d, v2.d }[0], [x3] /* D3 */
+ ret
+ stp d0, d1, [x3] /* D2 */
+ ret
+ str d0, [x3] /* D1 */
+ ret
+ str q3, [x3, #48] /* Q4 */
+ nop
+ str q2, [x3, #32] /* Q3 */
+ nop
+ stp q0, q1, [x3] /* Q2 */
+ ret
+ str q0, [x3] /* Q1 */
+ ret
+ uxtb w0, w0 /* UINT8 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+ uxth w0, w0 /* UINT16 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+ mov w0, w0 /* UINT32 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+ sxtb x0, w0 /* SINT8 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+ sxth x0, w0 /* SINT16 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+ sxtw x0, w0 /* SINT32 */
+ str x0, [x3]
+ ret /* reserved */
+ nop
+
+
+ NESTED_END ffi_call_SYSV_fake
+
+
+/* ffi_closure_SYSV
+ Closure invocation glue. This is the low level code invoked directly by
+ the closure trampoline to setup and call a closure.
+ On entry x17 points to a struct ffi_closure, x16 has been clobbered
+ all other registers are preserved.
+ We allocate a call context and save the argument passing registers,
+ then invoked the generic C ffi_closure_SYSV_inner() function to do all
+ the real work, on return we load the result passing registers back from
+ the call context.
+*/
+
+#define ffi_closure_SYSV_FS (8*2 + CALL_CONTEXT_SIZE + 64)
+
+ NESTED_ENTRY ffi_closure_SYSV_V
+ PROLOG_SAVE_REG_PAIR x29, x30, #-ffi_closure_SYSV_FS!
+
+ /* Save the argument passing vector registers. */
+ stp q0, q1, [sp, #16 + 0]
+ stp q2, q3, [sp, #16 + 32]
+ stp q4, q5, [sp, #16 + 64]
+ stp q6, q7, [sp, #16 + 96]
+
+ b ffi_closure_SYSV_save_argument
+ NESTED_END ffi_closure_SYSV_V
+
+ NESTED_ENTRY ffi_closure_SYSV
+ PROLOG_SAVE_REG_PAIR x29, x30, #-ffi_closure_SYSV_FS!
+
+ffi_closure_SYSV_save_argument
+ /* Save the argument passing core registers. */
+ stp x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+ stp x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+ stp x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+ stp x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+
+ /* Load ffi_closure_inner arguments. */
+ ldp PTR_REG(0), PTR_REG(1), [x17, #FFI_TRAMPOLINE_CLOSURE_OFFSET] /* load cif, fn */
+ ldr PTR_REG(2), [x17, #FFI_TRAMPOLINE_CLOSURE_OFFSET+PTR_SIZE*2] /* load user_data */
+
+do_closure
+ add x3, sp, #16 /* load context */
+ add x4, sp, #ffi_closure_SYSV_FS /* load stack */
+ add x5, sp, #16+CALL_CONTEXT_SIZE /* load rvalue */
+ mov x6, x8 /* load struct_rval */
+
+ bl ffi_closure_SYSV_inner
+
+ /* Load the return value as directed. */
+ adr x1, ffi_closure_SYSV_return_base
+ and w0, w0, #AARCH64_RET_MASK
+ add x1, x1, x0, lsl #3
+ add x3, sp, #16+CALL_CONTEXT_SIZE
+ br x1
+
+ /* Note that each table entry is 2 insns, and thus 8 bytes. */
+ ALIGN 8
+ffi_closure_SYSV_return_base
+ b ffi_closure_SYSV_epilog /* VOID */
+ nop
+ ldr x0, [x3] /* INT64 */
+ b ffi_closure_SYSV_epilog
+ ldp x0, x1, [x3] /* INT128 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* UNUSED */
+ nop
+ brk #1000 /* UNUSED */
+ nop
+ brk #1000 /* UNUSED */
+ nop
+ brk #1000 /* UNUSED */
+ nop
+ brk #1000 /* UNUSED */
+ nop
+ ldr s3, [x3, #12] /* S4 */
+ nop
+ ldr s2, [x3, #8] /* S3 */
+ nop
+ ldp s0, s1, [x3] /* S2 */
+ b ffi_closure_SYSV_epilog
+ ldr s0, [x3] /* S1 */
+ b ffi_closure_SYSV_epilog
+ ldr d3, [x3, #24] /* D4 */
+ nop
+ ldr d2, [x3, #16] /* D3 */
+ nop
+ ldp d0, d1, [x3] /* D2 */
+ b ffi_closure_SYSV_epilog
+ ldr d0, [x3] /* D1 */
+ b ffi_closure_SYSV_epilog
+ ldr q3, [x3, #48] /* Q4 */
+ nop
+ ldr q2, [x3, #32] /* Q3 */
+ nop
+ ldp q0, q1, [x3] /* Q2 */
+ b ffi_closure_SYSV_epilog
+ ldr q0, [x3] /* Q1 */
+ b ffi_closure_SYSV_epilog
+ ldrb w0, [x3, #BE(7)] /* UINT8 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* reserved */
+ nop
+ ldrh w0, [x3, #BE(6)] /* UINT16 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* reserved */
+ nop
+ ldr w0, [x3, #BE(4)] /* UINT32 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* reserved */
+ nop
+ ldrsb x0, [x3, #BE(7)] /* SINT8 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* reserved */
+ nop
+ ldrsh x0, [x3, #BE(6)] /* SINT16 */
+ b ffi_closure_SYSV_epilog
+ brk #1000 /* reserved */
+ nop
+ ldrsw x0, [x3, #BE(4)] /* SINT32 */
+ nop
+ /* reserved */
+
+ffi_closure_SYSV_epilog
+ EPILOG_RESTORE_REG_PAIR x29, x30, #ffi_closure_SYSV_FS!
+ EPILOG_RETURN
+ NESTED_END ffi_closure_SYSV
+
+
+#ifdef FFI_GO_CLOSURES
+ NESTED_ENTRY ffi_go_closure_SYSV_V
+ PROLOG_SAVE_REG_PAIR x29, x30, #-ffi_closure_SYSV_FS!
+
+ /* Save the argument passing vector registers. */
+ stp q0, q1, [sp, #16 + 0]
+ stp q2, q3, [sp, #16 + 32]
+ stp q4, q5, [sp, #16 + 64]
+ stp q6, q7, [sp, #16 + 96]
+ b ffi_go_closure_SYSV_save_argument
+ NESTED_END ffi_go_closure_SYSV_V
+
+ NESTED_ENTRY ffi_go_closure_SYSV
+ PROLOG_SAVE_REG_PAIR x29, x30, #-ffi_closure_SYSV_FS!
+
+ffi_go_closure_SYSV_save_argument
+ /* Save the argument passing core registers. */
+ stp x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+ stp x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+ stp x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+ stp x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+
+ /* Load ffi_closure_inner arguments. */
+ ldp PTR_REG(0), PTR_REG(1), [x18, #PTR_SIZE]/* load cif, fn */
+ mov x2, x18 /* load user_data */
+ b do_closure
+ NESTED_END ffi_go_closure_SYSV
+
+#endif /* FFI_GO_CLOSURES */
+
+
+/* void extend_hfa_type (void *dest, void *src, int h) */
+
+ LEAF_ENTRY extend_hfa_type
+
+ adr x3, extend_hfa_type_jump_base
+ and w2, w2, #AARCH64_RET_MASK
+ sub x2, x2, #AARCH64_RET_S4
+ add x3, x3, x2, lsl #4
+ br x3
+
+ ALIGN 4
+extend_hfa_type_jump_base
+ ldp s16, s17, [x1] /* S4 */
+ ldp s18, s19, [x1, #8]
+ b extend_hfa_type_store_4
+ nop
+
+ ldp s16, s17, [x1] /* S3 */
+ ldr s18, [x1, #8]
+ b extend_hfa_type_store_3
+ nop
+
+ ldp s16, s17, [x1] /* S2 */
+ b extend_hfa_type_store_2
+ nop
+ nop
+
+ ldr s16, [x1] /* S1 */
+ b extend_hfa_type_store_1
+ nop
+ nop
+
+ ldp d16, d17, [x1] /* D4 */
+ ldp d18, d19, [x1, #16]
+ b extend_hfa_type_store_4
+ nop
+
+ ldp d16, d17, [x1] /* D3 */
+ ldr d18, [x1, #16]
+ b extend_hfa_type_store_3
+ nop
+
+ ldp d16, d17, [x1] /* D2 */
+ b extend_hfa_type_store_2
+ nop
+ nop
+
+ ldr d16, [x1] /* D1 */
+ b extend_hfa_type_store_1
+ nop
+ nop
+
+ ldp q16, q17, [x1] /* Q4 */
+ ldp q18, q19, [x1, #16]
+ b extend_hfa_type_store_4
+ nop
+
+ ldp q16, q17, [x1] /* Q3 */
+ ldr q18, [x1, #16]
+ b extend_hfa_type_store_3
+ nop
+
+ ldp q16, q17, [x1] /* Q2 */
+ b extend_hfa_type_store_2
+ nop
+ nop
+
+ ldr q16, [x1] /* Q1 */
+ b extend_hfa_type_store_1
+
+extend_hfa_type_store_4
+ str q19, [x0, #48]
+extend_hfa_type_store_3
+ str q18, [x0, #32]
+extend_hfa_type_store_2
+ str q17, [x0, #16]
+extend_hfa_type_store_1
+ str q16, [x0]
+ ret
+
+ LEAF_END extend_hfa_type
+
+
+/* void compress_hfa_type (void *dest, void *reg, int h) */
+
+ LEAF_ENTRY compress_hfa_type
+
+ adr x3, compress_hfa_type_jump_base
+ and w2, w2, #AARCH64_RET_MASK
+ sub x2, x2, #AARCH64_RET_S4
+ add x3, x3, x2, lsl #4
+ br x3
+
+ ALIGN 4
+compress_hfa_type_jump_base
+ ldp q16, q17, [x1] /* S4 */
+ ldp q18, q19, [x1, #32]
+ st4 { v16.s, v17.s, v18.s, v19.s }[0], [x0]
+ ret
+
+ ldp q16, q17, [x1] /* S3 */
+ ldr q18, [x1, #32]
+ st3 { v16.s, v17.s, v18.s }[0], [x0]
+ ret
+
+ ldp q16, q17, [x1] /* S2 */
+ st2 { v16.s, v17.s }[0], [x0]
+ ret
+ nop
+
+ ldr q16, [x1] /* S1 */
+ st1 { v16.s }[0], [x0]
+ ret
+ nop
+
+ ldp q16, q17, [x1] /* D4 */
+ ldp q18, q19, [x1, #32]
+ st4 { v16.d, v17.d, v18.d, v19.d }[0], [x0]
+ ret
+
+ ldp q16, q17, [x1] /* D3 */
+ ldr q18, [x1, #32]
+ st3 { v16.d, v17.d, v18.d }[0], [x0]
+ ret
+
+ ldp q16, q17, [x1] /* D2 */
+ st2 { v16.d, v17.d }[0], [x0]
+ ret
+ nop
+
+ ldr q16, [x1] /* D1 */
+ st1 { v16.d }[0], [x0]
+ ret
+ nop
+
+ ldp q16, q17, [x1] /* Q4 */
+ ldp q18, q19, [x1, #32]
+ b compress_hfa_type_store_q4
+ nop
+
+ ldp q16, q17, [x1] /* Q3 */
+ ldr q18, [x1, #32]
+ b compress_hfa_type_store_q3
+ nop
+
+ ldp q16, q17, [x1] /* Q2 */
+ stp q16, q17, [x0]
+ ret
+ nop
+
+ ldr q16, [x1] /* Q1 */
+ str q16, [x0]
+ ret
+
+compress_hfa_type_store_q4
+ str q19, [x0, #48]
+compress_hfa_type_store_q3
+ str q18, [x0, #32]
+ stp q16, q17, [x0]
+ ret
+
+ LEAF_END compress_hfa_type
+
+ END \ No newline at end of file
diff --git a/libffi/src/alpha/ffi.c b/libffi/src/alpha/ffi.c
index efae4cc..7a95e97 100644
--- a/libffi/src/alpha/ffi.c
+++ b/libffi/src/alpha/ffi.c
@@ -98,7 +98,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
case FFI_TYPE_VOID:
case FFI_TYPE_STRUCT:
/* Passed by value in N slots. */
- bytes += ALIGN(itype->size, FFI_SIZEOF_ARG);
+ bytes += FFI_ALIGN(itype->size, FFI_SIZEOF_ARG);
break;
case FFI_TYPE_COMPLEX:
@@ -285,7 +285,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
case FFI_TYPE_STRUCT:
size = ty->size;
memcpy(argp + argn, valp, size);
- argn += ALIGN(size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ argn += FFI_ALIGN(size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
break;
case FFI_TYPE_COMPLEX:
@@ -421,7 +421,7 @@ ffi_closure_osf_inner (ffi_cif *cif,
case FFI_TYPE_VOID:
case FFI_TYPE_STRUCT:
size = ty->size;
- argn += ALIGN(size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ argn += FFI_ALIGN(size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
break;
case FFI_TYPE_FLOAT:
diff --git a/libffi/src/arc/ffi.c b/libffi/src/arc/ffi.c
index 32f82a7d5..4d10b21 100644
--- a/libffi/src/arc/ffi.c
+++ b/libffi/src/arc/ffi.c
@@ -46,12 +46,10 @@ void
ffi_prep_args (char *stack, extended_cif * ecif)
{
unsigned int i;
- int tmp;
void **p_argv;
char *argp;
ffi_type **p_arg;
- tmp = 0;
argp = stack;
if (ecif->cif->rtype->type == FFI_TYPE_STRUCT)
@@ -73,7 +71,7 @@ ffi_prep_args (char *stack, extended_cif * ecif)
/* Align if necessary. */
if ((alignment - 1) & (unsigned) argp)
- argp = (char *) ALIGN (argp, alignment);
+ argp = (char *) FFI_ALIGN (argp, alignment);
z = (*p_arg)->size;
if (z < sizeof (int))
@@ -225,7 +223,7 @@ ffi_closure_inner_ARCompact (ffi_closure * closure, void *rvalue,
/* Align if necessary. */
if ((alignment - 1) & (unsigned) argp)
- argp = (char *) ALIGN (argp, alignment);
+ argp = (char *) FFI_ALIGN (argp, alignment);
z = (*p_argt)->size;
*p_argv = (void *) argp;
diff --git a/libffi/src/arm/ffi.c b/libffi/src/arm/ffi.c
index 9c8732d..593ab4d 100644
--- a/libffi/src/arm/ffi.c
+++ b/libffi/src/arm/ffi.c
@@ -28,11 +28,42 @@
DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------- */
+#if defined(__arm__) || defined(_M_ARM)
+#include <fficonfig.h>
#include <ffi.h>
#include <ffi_common.h>
+#include <stdint.h>
#include <stdlib.h>
+#include <tramp.h>
#include "internal.h"
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#if FFI_EXEC_TRAMPOLINE_TABLE
+
+#ifdef __MACH__
+#include <mach/machine/vm_param.h>
+#endif
+
+#else
+#ifndef _WIN32
+extern unsigned int ffi_arm_trampoline[2] FFI_HIDDEN;
+#else
+// Declare this as an array of char, instead of array of int,
+// otherwise Clang optimizes out the "& 0xFFFFFFFE" for clearing
+// the thumb bit.
+extern unsigned char ffi_arm_trampoline[12] FFI_HIDDEN;
+#endif
+#endif
+
+#if defined(__FreeBSD__) && defined(__arm__)
+#include <sys/types.h>
+#include <machine/sysarch.h>
+#endif
+
/* Forward declares. */
static int vfp_type_p (const ffi_type *);
static void layout_vfp_args (ffi_cif *);
@@ -49,7 +80,7 @@ ffi_align (ffi_type *ty, void *p)
if (alignment < 4)
alignment = 4;
#endif
- return (void *) ALIGN (p, alignment);
+ return (void *) FFI_ALIGN (p, alignment);
}
static size_t
@@ -76,10 +107,20 @@ ffi_put_arg (ffi_type *ty, void *src, void *dst)
case FFI_TYPE_SINT32:
case FFI_TYPE_UINT32:
case FFI_TYPE_POINTER:
+#ifndef _WIN32
case FFI_TYPE_FLOAT:
+#endif
*(UINT32 *)dst = *(UINT32 *)src;
break;
+#ifdef _WIN32
+ // casting a float* to a UINT32* doesn't work on Windows
+ case FFI_TYPE_FLOAT:
+ *(uintptr_t *)dst = 0;
+ *(float *)dst = *(float *)src;
+ break;
+#endif
+
case FFI_TYPE_SINT64:
case FFI_TYPE_UINT64:
case FFI_TYPE_DOUBLE:
@@ -95,7 +136,7 @@ ffi_put_arg (ffi_type *ty, void *src, void *dst)
abort();
}
- return ALIGN (z, 4);
+ return FFI_ALIGN (z, 4);
}
/* ffi_prep_args is called once stack space has been allocated
@@ -198,7 +239,7 @@ ffi_prep_args_VFP (ffi_cif *cif, int flags, void *rvalue,
}
/* Perform machine dependent cif processing */
-ffi_status
+ffi_status FFI_HIDDEN
ffi_prep_cif_machdep (ffi_cif *cif)
{
int flags = 0, cabi = cif->abi;
@@ -276,7 +317,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
/* Round the stack up to a multiple of 8 bytes. This isn't needed
everywhere, but it is on some platforms, and it doesn't harm anything
when it isn't needed. */
- bytes = ALIGN (bytes, 8);
+ bytes = FFI_ALIGN (bytes, 8);
/* Minimum stack space is the 4 register arguments that we pop. */
if (bytes < 4*4)
@@ -289,7 +330,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
}
/* Perform machine dependent cif processing for variadic calls */
-ffi_status
+ffi_status FFI_HIDDEN
ffi_prep_cif_machdep_var (ffi_cif * cif,
unsigned int nfixedargs, unsigned int ntotalargs)
{
@@ -389,12 +430,14 @@ ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
+#ifdef FFI_GO_CLOSURES
void
ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
void **avalue, void *closure)
{
ffi_call_int (cif, fn, rvalue, avalue, closure);
}
+#endif
static void *
ffi_prep_incoming_args_SYSV (ffi_cif *cif, void *rvalue,
@@ -408,6 +451,11 @@ ffi_prep_incoming_args_SYSV (ffi_cif *cif, void *rvalue,
rvalue = *(void **) argp;
argp += 4;
}
+ else
+ {
+ if (cif->rtype->size && cif->rtype->size < 4)
+ *(uint32_t *) rvalue = 0;
+ }
for (i = 0, n = cif->nargs; i < n; i++)
{
@@ -492,6 +540,8 @@ ffi_prep_incoming_args_VFP (ffi_cif *cif, void *rvalue, char *stack,
return rvalue;
}
+#if FFI_CLOSURES
+
struct closure_frame
{
char vfp_space[8*8] __attribute__((aligned(8)));
@@ -527,257 +577,28 @@ ffi_closure_inner_VFP (ffi_cif *cif,
void ffi_closure_SYSV (void) FFI_HIDDEN;
void ffi_closure_VFP (void) FFI_HIDDEN;
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void ffi_closure_SYSV_alt (void) FFI_HIDDEN;
+void ffi_closure_VFP_alt (void) FFI_HIDDEN;
+#endif
+
+#ifdef FFI_GO_CLOSURES
void ffi_go_closure_SYSV (void) FFI_HIDDEN;
void ffi_go_closure_VFP (void) FFI_HIDDEN;
-
-#if FFI_EXEC_TRAMPOLINE_TABLE
-
-#include <mach/mach.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-extern void *ffi_closure_trampoline_table_page;
-
-typedef struct ffi_trampoline_table ffi_trampoline_table;
-typedef struct ffi_trampoline_table_entry ffi_trampoline_table_entry;
-
-struct ffi_trampoline_table
-{
- /* contiguous writable and executable pages */
- vm_address_t config_page;
- vm_address_t trampoline_page;
-
- /* free list tracking */
- uint16_t free_count;
- ffi_trampoline_table_entry *free_list;
- ffi_trampoline_table_entry *free_list_pool;
-
- ffi_trampoline_table *prev;
- ffi_trampoline_table *next;
-};
-
-struct ffi_trampoline_table_entry
-{
- void *(*trampoline) ();
- ffi_trampoline_table_entry *next;
-};
-
-/* Override the standard architecture trampoline size */
-// XXX TODO - Fix
-#undef FFI_TRAMPOLINE_SIZE
-#define FFI_TRAMPOLINE_SIZE 12
-
-/* The trampoline configuration is placed at 4080 bytes prior to the trampoline's entry point */
-#define FFI_TRAMPOLINE_CODELOC_CONFIG(codeloc) ((void **) (((uint8_t *) codeloc) - 4080));
-
-/* The first 16 bytes of the config page are unused, as they are unaddressable from the trampoline page. */
-#define FFI_TRAMPOLINE_CONFIG_PAGE_OFFSET 16
-
-/* Total number of trampolines that fit in one trampoline table */
-#define FFI_TRAMPOLINE_COUNT ((PAGE_SIZE - FFI_TRAMPOLINE_CONFIG_PAGE_OFFSET) / FFI_TRAMPOLINE_SIZE)
-
-static pthread_mutex_t ffi_trampoline_lock = PTHREAD_MUTEX_INITIALIZER;
-static ffi_trampoline_table *ffi_trampoline_tables = NULL;
-
-static ffi_trampoline_table *
-ffi_trampoline_table_alloc ()
-{
- ffi_trampoline_table *table = NULL;
-
- /* Loop until we can allocate two contiguous pages */
- while (table == NULL)
- {
- vm_address_t config_page = 0x0;
- kern_return_t kt;
-
- /* Try to allocate two pages */
- kt =
- vm_allocate (mach_task_self (), &config_page, PAGE_SIZE * 2,
- VM_FLAGS_ANYWHERE);
- if (kt != KERN_SUCCESS)
- {
- fprintf (stderr, "vm_allocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- break;
- }
-
- /* Now drop the second half of the allocation to make room for the trampoline table */
- vm_address_t trampoline_page = config_page + PAGE_SIZE;
- kt = vm_deallocate (mach_task_self (), trampoline_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- {
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- break;
- }
-
- /* Remap the trampoline table to directly follow the config page */
- vm_prot_t cur_prot;
- vm_prot_t max_prot;
-
- kt =
- vm_remap (mach_task_self (), &trampoline_page, PAGE_SIZE, 0x0, FALSE,
- mach_task_self (),
- (vm_address_t) & ffi_closure_trampoline_table_page, FALSE,
- &cur_prot, &max_prot, VM_INHERIT_SHARE);
-
- /* If we lost access to the destination trampoline page, drop our config allocation mapping and retry */
- if (kt != KERN_SUCCESS)
- {
- /* Log unexpected failures */
- if (kt != KERN_NO_SPACE)
- {
- fprintf (stderr, "vm_remap() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
- }
-
- vm_deallocate (mach_task_self (), config_page, PAGE_SIZE);
- continue;
- }
-
- /* We have valid trampoline and config pages */
- table = calloc (1, sizeof (ffi_trampoline_table));
- table->free_count = FFI_TRAMPOLINE_COUNT;
- table->config_page = config_page;
- table->trampoline_page = trampoline_page;
-
- /* Create and initialize the free list */
- table->free_list_pool =
- calloc (FFI_TRAMPOLINE_COUNT, sizeof (ffi_trampoline_table_entry));
-
- uint16_t i;
- for (i = 0; i < table->free_count; i++)
- {
- ffi_trampoline_table_entry *entry = &table->free_list_pool[i];
- entry->trampoline =
- (void *) (table->trampoline_page + (i * FFI_TRAMPOLINE_SIZE));
-
- if (i < table->free_count - 1)
- entry->next = &table->free_list_pool[i + 1];
- }
-
- table->free_list = table->free_list_pool;
- }
-
- return table;
-}
-
-void *
-ffi_closure_alloc (size_t size, void **code)
-{
- /* Create the closure */
- ffi_closure *closure = malloc (size);
- if (closure == NULL)
- return NULL;
-
- pthread_mutex_lock (&ffi_trampoline_lock);
-
- /* Check for an active trampoline table with available entries. */
- ffi_trampoline_table *table = ffi_trampoline_tables;
- if (table == NULL || table->free_list == NULL)
- {
- table = ffi_trampoline_table_alloc ();
- if (table == NULL)
- {
- free (closure);
- return NULL;
- }
-
- /* Insert the new table at the top of the list */
- table->next = ffi_trampoline_tables;
- if (table->next != NULL)
- table->next->prev = table;
-
- ffi_trampoline_tables = table;
- }
-
- /* Claim the free entry */
- ffi_trampoline_table_entry *entry = ffi_trampoline_tables->free_list;
- ffi_trampoline_tables->free_list = entry->next;
- ffi_trampoline_tables->free_count--;
- entry->next = NULL;
-
- pthread_mutex_unlock (&ffi_trampoline_lock);
-
- /* Initialize the return values */
- *code = entry->trampoline;
- closure->trampoline_table = table;
- closure->trampoline_table_entry = entry;
-
- return closure;
-}
-
-void
-ffi_closure_free (void *ptr)
-{
- ffi_closure *closure = ptr;
-
- pthread_mutex_lock (&ffi_trampoline_lock);
-
- /* Fetch the table and entry references */
- ffi_trampoline_table *table = closure->trampoline_table;
- ffi_trampoline_table_entry *entry = closure->trampoline_table_entry;
-
- /* Return the entry to the free list */
- entry->next = table->free_list;
- table->free_list = entry;
- table->free_count++;
-
- /* If all trampolines within this table are free, and at least one other table exists, deallocate
- * the table */
- if (table->free_count == FFI_TRAMPOLINE_COUNT
- && ffi_trampoline_tables != table)
- {
- /* Remove from the list */
- if (table->prev != NULL)
- table->prev->next = table->next;
-
- if (table->next != NULL)
- table->next->prev = table->prev;
-
- /* Deallocate pages */
- kern_return_t kt;
- kt = vm_deallocate (mach_task_self (), table->config_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
-
- kt =
- vm_deallocate (mach_task_self (), table->trampoline_page, PAGE_SIZE);
- if (kt != KERN_SUCCESS)
- fprintf (stderr, "vm_deallocate() failure: %d at %s:%d\n", kt,
- __FILE__, __LINE__);
-
- /* Deallocate free list */
- free (table->free_list_pool);
- free (table);
- }
- else if (ffi_trampoline_tables != table)
- {
- /* Otherwise, bump this table to the top of the list */
- table->prev = NULL;
- table->next = ffi_trampoline_tables;
- if (ffi_trampoline_tables != NULL)
- ffi_trampoline_tables->prev = table;
-
- ffi_trampoline_tables = table;
- }
-
- pthread_mutex_unlock (&ffi_trampoline_lock);
-
- /* Free the closure */
- free (closure);
-}
-
-#else
-
-extern unsigned int ffi_arm_trampoline[2] FFI_HIDDEN;
-
#endif
/* the cif must already be prep'ed */
+#if defined(__FreeBSD__) && defined(__arm__)
+#define __clear_cache(start, end) do { \
+ struct arm_sync_icache_args ua; \
+ \
+ ua.addr = (uintptr_t)(start); \
+ ua.len = (char *)(end) - (char *)start; \
+ sysarch(ARM_SYNC_ICACHE, &ua); \
+ } while (0);
+#endif
+
ffi_status
ffi_prep_closure_loc (ffi_closure * closure,
ffi_cif * cif,
@@ -796,15 +617,48 @@ ffi_prep_closure_loc (ffi_closure * closure,
return FFI_BAD_ABI;
#if FFI_EXEC_TRAMPOLINE_TABLE
- void **config = FFI_TRAMPOLINE_CODELOC_CONFIG (codeloc);
+ void **config = (void **)((uint8_t *)codeloc - PAGE_MAX_SIZE);
config[0] = closure;
config[1] = closure_func;
#else
- memcpy (closure->tramp, ffi_arm_trampoline, 8);
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ if (ffi_tramp_is_present(closure))
+ {
+ /* Initialize the static trampoline's parameters. */
+ if (closure_func == ffi_closure_SYSV)
+ closure_func = ffi_closure_SYSV_alt;
+ else
+ closure_func = ffi_closure_VFP_alt;
+ ffi_tramp_set_parms (closure->ftramp, closure_func, closure);
+ goto out;
+ }
+#endif
+
+ /* Initialize the dynamic trampoline. */
+#ifndef _WIN32
+ memcpy(closure->tramp, ffi_arm_trampoline, 8);
+#else
+ // cast away function type so MSVC doesn't set the lower bit of the function pointer
+ memcpy(closure->tramp, (void*)((uintptr_t)ffi_arm_trampoline & 0xFFFFFFFE), FFI_TRAMPOLINE_CLOSURE_OFFSET);
+#endif
+
+#if defined (__QNX__)
+ msync(closure->tramp, 8, 0x1000000); /* clear data map */
+ msync(codeloc, 8, 0x1000000); /* clear insn map */
+#elif defined(_WIN32)
+ FlushInstructionCache(GetCurrentProcess(), closure->tramp, FFI_TRAMPOLINE_SIZE);
+#else
__clear_cache(closure->tramp, closure->tramp + 8); /* clear data map */
__clear_cache(codeloc, codeloc + 8); /* clear insn map */
+#endif
+#ifdef _WIN32
+ *(void(**)(void))(closure->tramp + FFI_TRAMPOLINE_CLOSURE_FUNCTION) = closure_func;
+#else
*(void (**)(void))(closure->tramp + 8) = closure_func;
#endif
+out:
+#endif
closure->cif = cif;
closure->fun = fun;
@@ -813,6 +667,7 @@ ffi_prep_closure_loc (ffi_closure * closure,
return FFI_OK;
}
+#ifdef FFI_GO_CLOSURES
ffi_status
ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif *cif,
void (*fun) (ffi_cif *, void *, void **, void *))
@@ -834,6 +689,9 @@ ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif *cif,
return FFI_OK;
}
+#endif
+
+#endif /* FFI_CLOSURES */
/* Below are routines for VFP hard-float support. */
@@ -1005,7 +863,7 @@ place_vfp_arg (ffi_cif *cif, int h)
}
/* Found regs to allocate. */
cif->vfp_used |= new_used;
- cif->vfp_args[cif->vfp_nargs++] = reg;
+ cif->vfp_args[cif->vfp_nargs++] = (signed char)reg;
/* Update vfp_reg_free. */
if (cif->vfp_used & (1 << cif->vfp_reg_free))
@@ -1027,7 +885,7 @@ place_vfp_arg (ffi_cif *cif, int h)
static void
layout_vfp_args (ffi_cif * cif)
{
- int i;
+ unsigned int i;
/* Init VFP fields */
cif->vfp_used = 0;
cif->vfp_nargs = 0;
@@ -1041,3 +899,17 @@ layout_vfp_args (ffi_cif * cif)
break;
}
}
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void *
+ffi_tramp_arch (size_t *tramp_size, size_t *map_size)
+{
+ extern void *trampoline_code_table;
+
+ *tramp_size = ARM_TRAMP_SIZE;
+ *map_size = ARM_TRAMP_MAP_SIZE;
+ return &trampoline_code_table;
+}
+#endif
+
+#endif /* __arm__ or _M_ARM */
diff --git a/libffi/src/arm/ffitarget.h b/libffi/src/arm/ffitarget.h
index 4f473f9..12d5d20 100644
--- a/libffi/src/arm/ffitarget.h
+++ b/libffi/src/arm/ffitarget.h
@@ -43,7 +43,7 @@ typedef enum ffi_abi {
FFI_SYSV,
FFI_VFP,
FFI_LAST_ABI,
-#ifdef __ARM_PCS_VFP
+#if defined(__ARM_PCS_VFP) || defined(_WIN32)
FFI_DEFAULT_ABI = FFI_VFP,
#else
FFI_DEFAULT_ABI = FFI_SYSV,
@@ -57,13 +57,33 @@ typedef enum ffi_abi {
signed char vfp_args[16] \
#define FFI_TARGET_SPECIFIC_VARIADIC
+#ifndef _WIN32
#define FFI_TARGET_HAS_COMPLEX_TYPE
+#endif
/* ---- Definitions for closures ----------------------------------------- */
#define FFI_CLOSURES 1
#define FFI_GO_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 12
#define FFI_NATIVE_RAW_API 0
+#if defined (FFI_EXEC_TRAMPOLINE_TABLE) && FFI_EXEC_TRAMPOLINE_TABLE
+
+#ifdef __MACH__
+#define FFI_TRAMPOLINE_SIZE 12
+#define FFI_TRAMPOLINE_CLOSURE_OFFSET 8
+#else
+#error "No trampoline table implementation"
+#endif
+
+#else
+#ifdef _WIN32
+#define FFI_TRAMPOLINE_SIZE 16
+#define FFI_TRAMPOLINE_CLOSURE_FUNCTION 12
+#else
+#define FFI_TRAMPOLINE_SIZE 12
+#endif
+#define FFI_TRAMPOLINE_CLOSURE_OFFSET FFI_TRAMPOLINE_SIZE
+#endif
+
#endif
diff --git a/libffi/src/arm/internal.h b/libffi/src/arm/internal.h
index 6cf0b2a..fa8ab0b 100644
--- a/libffi/src/arm/internal.h
+++ b/libffi/src/arm/internal.h
@@ -5,3 +5,13 @@
#define ARM_TYPE_INT 4
#define ARM_TYPE_VOID 5
#define ARM_TYPE_STRUCT 6
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+/*
+ * For the trampoline table mapping, a mapping size of 4K (base page size)
+ * is chosen.
+ */
+#define ARM_TRAMP_MAP_SHIFT 12
+#define ARM_TRAMP_MAP_SIZE (1 << ARM_TRAMP_MAP_SHIFT)
+#define ARM_TRAMP_SIZE 20
+#endif
diff --git a/libffi/src/arm/sysv.S b/libffi/src/arm/sysv.S
index fd16589..fb36213 100644
--- a/libffi/src/arm/sysv.S
+++ b/libffi/src/arm/sysv.S
@@ -25,7 +25,8 @@
DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------- */
-#define LIBFFI_ASM
+#ifdef __arm__
+#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
#include <ffi_cfi.h>
@@ -52,11 +53,12 @@
#endif
/* Conditionally compile unwinder directives. */
-.macro UNWIND text:vararg
#ifdef __ARM_EABI__
- \text
-#endif
-.endm
+# define UNWIND(...) __VA_ARGS__
+#else
+# define UNWIND(...)
+#endif
+
#if defined(HAVE_AS_CFI_PSEUDO_OP) && defined(__ARM_EABI__)
.cfi_sections .debug_frame
#endif
@@ -77,29 +79,52 @@
# define TYPE(X, Y)
#endif
-#define ARM_FUNC_START(name, gl) \
- .align 3; \
- .ifne gl; .globl CNAME(name); FFI_HIDDEN(CNAME(name)); .endif; \
- TYPE(name, %function); \
+#define ARM_FUNC_START_LOCAL(name) \
+ .align 3; \
+ TYPE(CNAME(name), %function); \
CNAME(name):
+#define ARM_FUNC_START(name) \
+ .globl CNAME(name); \
+ FFI_HIDDEN(CNAME(name)); \
+ ARM_FUNC_START_LOCAL(name)
+
#define ARM_FUNC_END(name) \
SIZE(name)
-/* Aid in defining a jump table with 8 bytes between entries. */
-.macro E index
- .if . - 0b - 8*\index
- .error "type table out of sync"
- .endif
-.endm
-
.text
.syntax unified
+#if defined(_WIN32)
+ /* Windows on ARM is thumb-only */
+ .thumb
+#else
+ /* Keep the assembly in ARM mode in other cases, for simplicity
+ * (to avoid interworking issues). */
+#undef __thumb__
.arm
+#endif
+/* Aid in defining a jump table with 8 bytes between entries. */
+#ifdef __thumb__
+/* In thumb mode, instructions can be shorter than expected in arm mode, so
+ * we need to align the start of each case. */
+# define E(index) .align 3
+#elif defined(__clang__)
+/* ??? The clang assembler doesn't handle .if with symbolic expressions. */
+# define E(index)
+#else
+# define E(index) \
+ .if . - 0b - 8*index; \
+ .error "type table out of sync"; \
+ .endif
+#endif
+
+
+#ifndef __clang__
/* We require interworking on LDM, which implies ARMv5T,
which implies the existance of BLX. */
- .arch armv5t
+ .arch armv5t
+#endif
/* Note that we use STC and LDC to encode VFP instructions,
so that we do not need ".fpu vfp", nor get that added to
@@ -111,25 +136,31 @@
@ r2: fn
@ r3: vfp_used
-ARM_FUNC_START(ffi_call_VFP, 1)
- UNWIND .fnstart
+ARM_FUNC_START(ffi_call_VFP)
+ UNWIND(.fnstart)
cfi_startproc
cmp r3, #3 @ load only d0 if possible
- ldcle p11, cr0, [r0] @ vldrle d0, [sp]
- ldcgt p11, cr0, [r0], {16} @ vldmgt sp, {d0-d7}
+ ite le
+#ifdef __clang__
+ vldrle d0, [r0]
+ vldmgt r0, {d0-d7}
+#else
+ ldcle p11, cr0, [r0] @ vldrle d0, [r0]
+ ldcgt p11, cr0, [r0], {16} @ vldmgt r0, {d0-d7}
+#endif
add r0, r0, #64 @ discard the vfp register args
/* FALLTHRU */
ARM_FUNC_END(ffi_call_VFP)
-ARM_FUNC_START(ffi_call_SYSV, 1)
+ARM_FUNC_START(ffi_call_SYSV)
stm r1, {fp, lr}
mov fp, r1
@ This is a bit of a lie wrt the origin of the unwind info, but
@ now we've got the usual frame pointer and two saved registers.
- UNWIND .save {fp,lr}
- UNWIND .setfp fp, sp
+ UNWIND(.save {fp,lr})
+ UNWIND(.setfp fp, sp)
cfi_def_cfa(fp, 8)
cfi_rel_offset(fp, 0)
cfi_rel_offset(lr, 4)
@@ -150,41 +181,61 @@ ARM_FUNC_START(ffi_call_SYSV, 1)
cfi_def_cfa_register(sp)
@ Store values stored in registers.
+#ifndef __thumb__
.align 3
add pc, pc, r3, lsl #3
nop
+#else
+ adr ip, 0f
+ add ip, ip, r3, lsl #3
+ mov pc, ip
+ .align 3
+#endif
0:
-E ARM_TYPE_VFP_S
+E(ARM_TYPE_VFP_S)
+#ifdef __clang__
+ vstr s0, [r2]
+#else
stc p10, cr0, [r2] @ vstr s0, [r2]
+#endif
pop {fp,pc}
-E ARM_TYPE_VFP_D
+E(ARM_TYPE_VFP_D)
+#ifdef __clang__
+ vstr d0, [r2]
+#else
stc p11, cr0, [r2] @ vstr d0, [r2]
+#endif
pop {fp,pc}
-E ARM_TYPE_VFP_N
+E(ARM_TYPE_VFP_N)
+#ifdef __clang__
+ vstm r2, {d0-d3}
+#else
stc p11, cr0, [r2], {8} @ vstm r2, {d0-d3}
+#endif
pop {fp,pc}
-E ARM_TYPE_INT64
+E(ARM_TYPE_INT64)
str r1, [r2, #4]
nop
-E ARM_TYPE_INT
+E(ARM_TYPE_INT)
str r0, [r2]
pop {fp,pc}
-E ARM_TYPE_VOID
+E(ARM_TYPE_VOID)
pop {fp,pc}
nop
-E ARM_TYPE_STRUCT
+E(ARM_TYPE_STRUCT)
pop {fp,pc}
cfi_endproc
- UNWIND .fnend
+ UNWIND(.fnend)
ARM_FUNC_END(ffi_call_SYSV)
+#if FFI_CLOSURES
/*
int ffi_closure_inner_* (cif, fun, user_data, frame)
*/
-ARM_FUNC_START(ffi_go_closure_SYSV, 1)
+ARM_FUNC_START(ffi_go_closure_SYSV)
cfi_startproc
stmdb sp!, {r0-r3} @ save argument regs
cfi_adjust_cfa_offset(16)
@@ -195,14 +246,21 @@ ARM_FUNC_START(ffi_go_closure_SYSV, 1)
cfi_endproc
ARM_FUNC_END(ffi_go_closure_SYSV)
-ARM_FUNC_START(ffi_closure_SYSV, 1)
- UNWIND .fnstart
+ARM_FUNC_START(ffi_closure_SYSV)
+ UNWIND(.fnstart)
cfi_startproc
+#ifdef _WIN32
+ ldmfd sp!, {r0, ip} @ restore fp (r0 is used for stack alignment)
+#endif
stmdb sp!, {r0-r3} @ save argument regs
cfi_adjust_cfa_offset(16)
- ldr r0, [ip, #FFI_TRAMPOLINE_SIZE] @ load cif
- ldr r1, [ip, #FFI_TRAMPOLINE_SIZE+4] @ load fun
- ldr r2, [ip, #FFI_TRAMPOLINE_SIZE+8] @ load user_data
+
+#if FFI_EXEC_TRAMPOLINE_TABLE
+ ldr ip, [ip] @ ip points to the config page, dereference to get the ffi_closure*
+#endif
+ ldr r0, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET] @ load cif
+ ldr r1, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+4] @ load fun
+ ldr r2, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+8] @ load user_data
0:
add ip, sp, #16 @ compute entry sp
sub sp, sp, #64+32 @ allocate frame
@@ -212,7 +270,7 @@ ARM_FUNC_START(ffi_closure_SYSV, 1)
/* Remember that EABI unwind info only applies at call sites.
We need do nothing except note the save of the stack pointer
and the link registers. */
- UNWIND .save {sp,lr}
+ UNWIND(.save {sp,lr})
cfi_adjust_cfa_offset(8)
cfi_rel_offset(lr, 4)
@@ -222,12 +280,17 @@ ARM_FUNC_START(ffi_closure_SYSV, 1)
@ Load values returned in registers.
add r2, sp, #8+64 @ load result
adr r3, CNAME(ffi_closure_ret)
+#ifndef __thumb__
add pc, r3, r0, lsl #3
+#else
+ add r3, r3, r0, lsl #3
+ mov pc, r3
+#endif
cfi_endproc
- UNWIND .fnend
+ UNWIND(.fnend)
ARM_FUNC_END(ffi_closure_SYSV)
-ARM_FUNC_START(ffi_go_closure_VFP, 1)
+ARM_FUNC_START(ffi_go_closure_VFP)
cfi_startproc
stmdb sp!, {r0-r3} @ save argument regs
cfi_adjust_cfa_offset(16)
@@ -238,23 +301,34 @@ ARM_FUNC_START(ffi_go_closure_VFP, 1)
cfi_endproc
ARM_FUNC_END(ffi_go_closure_VFP)
-ARM_FUNC_START(ffi_closure_VFP, 1)
- UNWIND .fnstart
+ARM_FUNC_START(ffi_closure_VFP)
+ UNWIND(.fnstart)
cfi_startproc
+#ifdef _WIN32
+ ldmfd sp!, {r0, ip} @ restore fp (r0 is used for stack alignment)
+#endif
stmdb sp!, {r0-r3} @ save argument regs
cfi_adjust_cfa_offset(16)
- ldr r0, [ip, #FFI_TRAMPOLINE_SIZE] @ load cif
- ldr r1, [ip, #FFI_TRAMPOLINE_SIZE+4] @ load fun
- ldr r2, [ip, #FFI_TRAMPOLINE_SIZE+8] @ load user_data
+
+#if FFI_EXEC_TRAMPOLINE_TABLE
+ ldr ip, [ip] @ ip points to the config page, dereference to get the ffi_closure*
+#endif
+ ldr r0, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET] @ load cif
+ ldr r1, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+4] @ load fun
+ ldr r2, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+8] @ load user_data
0:
add ip, sp, #16
sub sp, sp, #64+32 @ allocate frame
cfi_adjust_cfa_offset(64+32)
+#ifdef __clang__
+ vstm sp, {d0-d7}
+#else
stc p11, cr0, [sp], {16} @ vstm sp, {d0-d7}
+#endif
stmdb sp!, {ip,lr}
/* See above. */
- UNWIND .save {sp,lr}
+ UNWIND(.save {sp,lr})
cfi_adjust_cfa_offset(8)
cfi_rel_offset(lr, 4)
@@ -264,71 +338,151 @@ ARM_FUNC_START(ffi_closure_VFP, 1)
@ Load values returned in registers.
add r2, sp, #8+64 @ load result
adr r3, CNAME(ffi_closure_ret)
+#ifndef __thumb__
add pc, r3, r0, lsl #3
+#else
+ add r3, r3, r0, lsl #3
+ mov pc, r3
+#endif
cfi_endproc
- UNWIND .fnend
+ UNWIND(.fnend)
ARM_FUNC_END(ffi_closure_VFP)
/* Load values returned in registers for both closure entry points.
Note that we use LDM with SP in the register set. This is deprecated
by ARM, but not yet unpredictable. */
-ARM_FUNC_START(ffi_closure_ret, 0)
+ARM_FUNC_START_LOCAL(ffi_closure_ret)
cfi_startproc
cfi_rel_offset(sp, 0)
cfi_rel_offset(lr, 4)
0:
-E ARM_TYPE_VFP_S
+E(ARM_TYPE_VFP_S)
+#ifdef __clang__
+ vldr s0, [r2]
+#else
ldc p10, cr0, [r2] @ vldr s0, [r2]
- ldm sp, {sp,pc}
-E ARM_TYPE_VFP_D
+#endif
+ b call_epilogue
+E(ARM_TYPE_VFP_D)
+#ifdef __clang__
+ vldr d0, [r2]
+#else
ldc p11, cr0, [r2] @ vldr d0, [r2]
- ldm sp, {sp,pc}
-E ARM_TYPE_VFP_N
+#endif
+ b call_epilogue
+E(ARM_TYPE_VFP_N)
+#ifdef __clang__
+ vldm r2, {d0-d3}
+#else
ldc p11, cr0, [r2], {8} @ vldm r2, {d0-d3}
- ldm sp, {sp,pc}
-E ARM_TYPE_INT64
+#endif
+ b call_epilogue
+E(ARM_TYPE_INT64)
ldr r1, [r2, #4]
nop
-E ARM_TYPE_INT
+E(ARM_TYPE_INT)
ldr r0, [r2]
- ldm sp, {sp,pc}
-E ARM_TYPE_VOID
- ldm sp, {sp,pc}
+ b call_epilogue
+E(ARM_TYPE_VOID)
+ b call_epilogue
nop
-E ARM_TYPE_STRUCT
+E(ARM_TYPE_STRUCT)
+ b call_epilogue
+call_epilogue:
+#ifndef __thumb__
ldm sp, {sp,pc}
+#else
+ ldm sp, {ip,lr}
+ mov sp, ip
+ bx lr
+#endif
cfi_endproc
ARM_FUNC_END(ffi_closure_ret)
-#if FFI_EXEC_TRAMPOLINE_TABLE
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ARM_FUNC_START(ffi_closure_SYSV_alt)
+ /* See the comments above trampoline_code_table. */
+ ldr ip, [sp, #4] /* Load closure in ip */
+ add sp, sp, 8 /* Restore the stack */
+ b CNAME(ffi_closure_SYSV)
+ARM_FUNC_END(ffi_closure_SYSV_alt)
+
+ARM_FUNC_START(ffi_closure_VFP_alt)
+ /* See the comments above trampoline_code_table. */
+ ldr ip, [sp, #4] /* Load closure in ip */
+ add sp, sp, 8 /* Restore the stack */
+ b CNAME(ffi_closure_VFP)
+ARM_FUNC_END(ffi_closure_VFP_alt)
-/* ??? The iOS support should be updated. The first insn used to
- be STMFD, but that's been moved into ffi_closure_SYSV. If the
- writable page is put after this one we can make use of the
- pc+8 feature of the architecture. We can also reduce the size
- of the thunk to 8 and pack more of these into the page.
+/*
+ * Below is the definition of the trampoline code table. Each element in
+ * the code table is a trampoline.
+ */
+/*
+ * The trampoline uses register ip (r12). It saves the original value of ip
+ * on the stack.
+ *
+ * The trampoline has two parameters - target code to jump to and data for
+ * the target code. The trampoline extracts the parameters from its parameter
+ * block (see tramp_table_map()). The trampoline saves the data address on
+ * the stack. Finally, it jumps to the target code.
+ *
+ * The target code can choose to:
+ *
+ * - restore the value of ip
+ * - load the data address in a register
+ * - restore the stack pointer to what it was when the trampoline was invoked.
+ */
+ .align ARM_TRAMP_MAP_SHIFT
+ARM_FUNC_START(trampoline_code_table)
+ .rept ARM_TRAMP_MAP_SIZE / ARM_TRAMP_SIZE
+ sub sp, sp, #8 /* Make space on the stack */
+ str ip, [sp] /* Save ip on stack */
+ ldr ip, [pc, #4080] /* Copy data into ip */
+ str ip, [sp, #4] /* Save data on stack */
+ ldr pc, [pc, #4076] /* Copy code into PC */
+ .endr
+ARM_FUNC_END(trampoline_code_table)
+ .align ARM_TRAMP_MAP_SHIFT
+#endif /* FFI_EXEC_STATIC_TRAMP */
+
+#endif /* FFI_CLOSURES */
+
+#if FFI_EXEC_TRAMPOLINE_TABLE
- In the meantime, simply replace the STMFD with a NOP so as to
- keep all the magic numbers the same within ffi.c. */
+#ifdef __MACH__
+#include <mach/machine/vm_param.h>
- .align 12
+.align PAGE_MAX_SHIFT
ARM_FUNC_START(ffi_closure_trampoline_table_page)
-.rept 4096 / 12
- nop
- ldr ip, [pc, #-4092]
- ldr pc, [pc, #-4092]
+.rept PAGE_MAX_SIZE / FFI_TRAMPOLINE_SIZE
+ adr ip, #-PAGE_MAX_SIZE @ the config page is PAGE_MAX_SIZE behind the trampoline page
+ sub ip, #8 @ account for pc bias
+ ldr pc, [ip, #4] @ jump to ffi_closure_SYSV or ffi_closure_VFP
.endr
+ARM_FUNC_END(ffi_closure_trampoline_table_page)
+#endif
+
+#elif defined(_WIN32)
+
+ARM_FUNC_START(ffi_arm_trampoline)
+0: adr ip, 0b
+ stmdb sp!, {r0, ip}
+ ldr pc, 1f
+1: .long 0
+ARM_FUNC_END(ffi_arm_trampoline)
#else
-ARM_FUNC_START(ffi_arm_trampoline, 1)
+ARM_FUNC_START(ffi_arm_trampoline)
0: adr ip, 0b
ldr pc, 1f
1: .long 0
ARM_FUNC_END(ffi_arm_trampoline)
#endif /* FFI_EXEC_TRAMPOLINE_TABLE */
+#endif /* __arm__ */
#if defined __ELF__ && defined __linux__
.section .note.GNU-stack,"",%progbits
diff --git a/libffi/src/arm/sysv_msvc_arm32.S b/libffi/src/arm/sysv_msvc_arm32.S
new file mode 100644
index 0000000..5c99d02
--- /dev/null
+++ b/libffi/src/arm/sysv_msvc_arm32.S
@@ -0,0 +1,311 @@
+/* -----------------------------------------------------------------------
+ sysv.S - Copyright (c) 1998, 2008, 2011 Red Hat, Inc.
+ Copyright (c) 2011 Plausible Labs Cooperative, Inc.
+ Copyright (c) 2019 Microsoft Corporation.
+
+ ARM Foreign Function Interface
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+#include <ffi_cfi.h>
+#include "internal.h"
+#include "ksarm.h"
+
+
+ ; 8 byte aligned AREA to support 8 byte aligned jump tables
+ MACRO
+ NESTED_ENTRY_FFI $FuncName, $AreaName, $ExceptHandler
+
+ ; compute the function's labels
+ __DeriveFunctionLabels $FuncName
+
+ ; determine the area we will put the function into
+__FuncArea SETS "|.text|"
+ IF "$AreaName" != ""
+__FuncArea SETS "$AreaName"
+ ENDIF
+
+ ; set up the exception handler itself
+__FuncExceptionHandler SETS ""
+ IF "$ExceptHandler" != ""
+__FuncExceptionHandler SETS "|$ExceptHandler|"
+ ENDIF
+
+ ; switch to the specified area, jump tables require 8 byte alignment
+ AREA $__FuncArea,CODE,CODEALIGN,ALIGN=3,READONLY
+
+ ; export the function name
+ __ExportProc $FuncName
+
+ ; flush any pending literal pool stuff
+ ROUT
+
+ ; reset the state of the unwind code tracking
+ __ResetUnwindState
+
+ MEND
+
+; MACRO
+; TABLE_ENTRY $Type, $Table
+;$Type_$Table
+; MEND
+
+#define E(index,table) return_##index##_##table
+
+ ; r0: stack
+ ; r1: frame
+ ; r2: fn
+ ; r3: vfp_used
+
+ ; fake entry point exists only to
+ ; generate .pdata for exception unwinding
+ NESTED_ENTRY_FFI ffi_call_VFP_fake
+ PROLOG_PUSH {r11, lr} ; save fp and lr for unwind
+
+ ALTERNATE_ENTRY ffi_call_VFP
+ cmp r3, #3 ; load only d0 if possible
+ vldrle d0, [r0]
+ vldmgt r0, {d0-d7}
+ add r0, r0, #64 ; discard the vfp register args
+ b ffi_call_SYSV
+ NESTED_END ffi_call_VFP_fake
+
+ ; fake entry point exists only to
+ ; generate .pdata for exception unwinding
+ NESTED_ENTRY_FFI ffi_call_SYSV_fake
+ PROLOG_PUSH {r11, lr} ; save fp and lr for unwind
+
+ ALTERNATE_ENTRY ffi_call_SYSV
+ stm r1, {fp, lr}
+ mov fp, r1
+
+ mov sp, r0 ; install the stack pointer
+ mov lr, r2 ; move the fn pointer out of the way
+ ldr ip, [fp, #16] ; install the static chain
+ ldmia sp!, {r0-r3} ; move first 4 parameters in registers.
+ blx lr ; call fn
+
+ ; Load r2 with the pointer to storage for the return value
+ ; Load r3 with the return type code
+ ldr r2, [fp, #8]
+ ldr r3, [fp, #12]
+
+ ; Deallocate the stack with the arguments.
+ mov sp, fp
+
+ ; Store values stored in registers.
+ ALIGN 8
+ lsl r3, #3
+ add r3, r3, pc
+ add r3, #8
+ mov pc, r3
+
+
+E(ARM_TYPE_VFP_S, ffi_call)
+ ALIGN 8
+ vstr s0, [r2]
+ pop {fp,pc}
+E(ARM_TYPE_VFP_D, ffi_call)
+ ALIGN 8
+ vstr d0, [r2]
+ pop {fp,pc}
+E(ARM_TYPE_VFP_N, ffi_call)
+ ALIGN 8
+ vstm r2, {d0-d3}
+ pop {fp,pc}
+E(ARM_TYPE_INT64, ffi_call)
+ ALIGN 8
+ str r1, [r2, #4]
+ nop
+E(ARM_TYPE_INT, ffi_call)
+ ALIGN 8
+ str r0, [r2]
+ pop {fp,pc}
+E(ARM_TYPE_VOID, ffi_call)
+ ALIGN 8
+ pop {fp,pc}
+ nop
+E(ARM_TYPE_STRUCT, ffi_call)
+ ALIGN 8
+ cmp r3, #ARM_TYPE_STRUCT
+ pop {fp,pc}
+ NESTED_END ffi_call_SYSV_fake
+
+ IMPORT |ffi_closure_inner_SYSV|
+ /*
+ int ffi_closure_inner_SYSV
+ (
+ cif, ; r0
+ fun, ; r1
+ user_data, ; r2
+ frame ; r3
+ )
+ */
+
+ NESTED_ENTRY_FFI ffi_go_closure_SYSV
+ stmdb sp!, {r0-r3} ; save argument regs
+ ldr r0, [ip, #4] ; load cif
+ ldr r1, [ip, #8] ; load fun
+ mov r2, ip ; load user_data
+ b ffi_go_closure_SYSV_0
+ NESTED_END ffi_go_closure_SYSV
+
+ ; r3: ffi_closure
+
+ ; fake entry point exists only to
+ ; generate .pdata for exception unwinding
+ NESTED_ENTRY_FFI ffi_closure_SYSV_fake
+ PROLOG_PUSH {r11, lr} ; save fp and lr for unwind
+ ALTERNATE_ENTRY ffi_closure_SYSV
+ ldmfd sp!, {ip,r0} ; restore fp (r0 is used for stack alignment)
+ stmdb sp!, {r0-r3} ; save argument regs
+
+ ldr r0, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET] ; ffi_closure->cif
+ ldr r1, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+4] ; ffi_closure->fun
+ ldr r2, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+8] ; ffi_closure->user_data
+
+ ALTERNATE_ENTRY ffi_go_closure_SYSV_0
+ add ip, sp, #16 ; compute entry sp
+
+ sub sp, sp, #64+32 ; allocate frame parameter (sizeof(vfp_space) = 64, sizeof(result) = 32)
+ mov r3, sp ; set frame parameter
+ stmdb sp!, {ip,lr}
+
+ bl ffi_closure_inner_SYSV ; call the Python closure
+
+ ; Load values returned in registers.
+ add r2, sp, #64+8 ; address of closure_frame->result
+ bl ffi_closure_ret ; move result to correct register or memory for type
+
+ ldmfd sp!, {ip,lr}
+ mov sp, ip ; restore stack pointer
+ mov pc, lr
+ NESTED_END ffi_closure_SYSV_fake
+
+ IMPORT |ffi_closure_inner_VFP|
+ /*
+ int ffi_closure_inner_VFP
+ (
+ cif, ; r0
+ fun, ; r1
+ user_data, ; r2
+ frame ; r3
+ )
+ */
+
+ NESTED_ENTRY_FFI ffi_go_closure_VFP
+ stmdb sp!, {r0-r3} ; save argument regs
+ ldr r0, [ip, #4] ; load cif
+ ldr r1, [ip, #8] ; load fun
+ mov r2, ip ; load user_data
+ b ffi_go_closure_VFP_0
+ NESTED_END ffi_go_closure_VFP
+
+ ; fake entry point exists only to
+ ; generate .pdata for exception unwinding
+ ; r3: closure
+ NESTED_ENTRY_FFI ffi_closure_VFP_fake
+ PROLOG_PUSH {r11, lr} ; save fp and lr for unwind
+
+ ALTERNATE_ENTRY ffi_closure_VFP
+ ldmfd sp!, {ip,r0} ; restore fp (r0 is used for stack alignment)
+ stmdb sp!, {r0-r3} ; save argument regs
+
+ ldr r0, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET] ; load cif
+ ldr r1, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+4] ; load fun
+ ldr r2, [ip, #FFI_TRAMPOLINE_CLOSURE_OFFSET+8] ; load user_data
+
+ ALTERNATE_ENTRY ffi_go_closure_VFP_0
+ add ip, sp, #16 ; compute entry sp
+ sub sp, sp, #32 ; save space for closure_frame->result
+ vstmdb sp!, {d0-d7} ; push closure_frame->vfp_space
+
+ mov r3, sp ; save closure_frame
+ stmdb sp!, {ip,lr}
+
+ bl ffi_closure_inner_VFP
+
+ ; Load values returned in registers.
+ add r2, sp, #64+8 ; load result
+ bl ffi_closure_ret
+ ldmfd sp!, {ip,lr}
+ mov sp, ip ; restore stack pointer
+ mov pc, lr
+ NESTED_END ffi_closure_VFP_fake
+
+/* Load values returned in registers for both closure entry points.
+ Note that we use LDM with SP in the register set. This is deprecated
+ by ARM, but not yet unpredictable. */
+
+ NESTED_ENTRY_FFI ffi_closure_ret
+ stmdb sp!, {fp,lr}
+
+ ALIGN 8
+ lsl r0, #3
+ add r0, r0, pc
+ add r0, #8
+ mov pc, r0
+
+E(ARM_TYPE_VFP_S, ffi_closure)
+ ALIGN 8
+ vldr s0, [r2]
+ b call_epilogue
+E(ARM_TYPE_VFP_D, ffi_closure)
+ ALIGN 8
+ vldr d0, [r2]
+ b call_epilogue
+E(ARM_TYPE_VFP_N, ffi_closure)
+ ALIGN 8
+ vldm r2, {d0-d3}
+ b call_epilogue
+E(ARM_TYPE_INT64, ffi_closure)
+ ALIGN 8
+ ldr r1, [r2, #4]
+ nop
+E(ARM_TYPE_INT, ffi_closure)
+ ALIGN 8
+ ldr r0, [r2]
+ b call_epilogue
+E(ARM_TYPE_VOID, ffi_closure)
+ ALIGN 8
+ b call_epilogue
+ nop
+E(ARM_TYPE_STRUCT, ffi_closure)
+ ALIGN 8
+ b call_epilogue
+call_epilogue
+ ldmfd sp!, {fp,pc}
+ NESTED_END ffi_closure_ret
+
+ AREA |.trampoline|, DATA, THUMB, READONLY
+ EXPORT |ffi_arm_trampoline|
+|ffi_arm_trampoline| DATA
+thisproc adr ip, thisproc
+ stmdb sp!, {ip, r0}
+ ldr pc, [pc, #0]
+ DCD 0
+ ;ENDP
+
+ END \ No newline at end of file
diff --git a/libffi/src/closures.c b/libffi/src/closures.c
index 721ff00..f7bead6 100644
--- a/libffi/src/closures.c
+++ b/libffi/src/closures.c
@@ -1,5 +1,6 @@
/* -----------------------------------------------------------------------
- closures.c - Copyright (c) 2007, 2009, 2010 Red Hat, Inc.
+ closures.c - Copyright (c) 2019 Anthony Green
+ Copyright (c) 2007, 2009, 2010 Red Hat, Inc.
Copyright (C) 2007, 2009, 2010 Free Software Foundation, Inc
Copyright (c) 2011 Plausible Labs Cooperative, Inc.
@@ -30,11 +31,98 @@
#define _GNU_SOURCE 1
#endif
+#include <fficonfig.h>
#include <ffi.h>
#include <ffi_common.h>
+#include <tramp.h>
+
+#ifdef __NetBSD__
+#include <sys/param.h>
+#endif
+
+#if __NetBSD_Version__ - 0 >= 799007200
+/* NetBSD with PROT_MPROTECT */
+#include <sys/mman.h>
+
+#include <stddef.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_MEMFD_H
+#include <sys/memfd.h>
+#endif
+
+static const size_t overhead =
+ (sizeof(max_align_t) > sizeof(void *) + sizeof(size_t)) ?
+ sizeof(max_align_t)
+ : sizeof(void *) + sizeof(size_t);
+
+#define ADD_TO_POINTER(p, d) ((void *)((uintptr_t)(p) + (d)))
+
+void *
+ffi_closure_alloc (size_t size, void **code)
+{
+ static size_t page_size;
+ size_t rounded_size;
+ void *codeseg, *dataseg;
+ int prot;
+
+ /* Expect that PAX mprotect is active and a separate code mapping is necessary. */
+ if (!code)
+ return NULL;
+
+ /* Obtain system page size. */
+ if (!page_size)
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Round allocation size up to the next page, keeping in mind the size field and pointer to code map. */
+ rounded_size = (size + overhead + page_size - 1) & ~(page_size - 1);
+
+ /* Primary mapping is RW, but request permission to switch to PROT_EXEC later. */
+ prot = PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC);
+ dataseg = mmap(NULL, rounded_size, prot, MAP_ANON | MAP_PRIVATE, -1, 0);
+ if (dataseg == MAP_FAILED)
+ return NULL;
+
+ /* Create secondary mapping and switch it to RX. */
+ codeseg = mremap(dataseg, rounded_size, NULL, rounded_size, MAP_REMAPDUP);
+ if (codeseg == MAP_FAILED) {
+ munmap(dataseg, rounded_size);
+ return NULL;
+ }
+ if (mprotect(codeseg, rounded_size, PROT_READ | PROT_EXEC) == -1) {
+ munmap(codeseg, rounded_size);
+ munmap(dataseg, rounded_size);
+ return NULL;
+ }
+
+ /* Remember allocation size and location of the secondary mapping for ffi_closure_free. */
+ memcpy(dataseg, &rounded_size, sizeof(rounded_size));
+ memcpy(ADD_TO_POINTER(dataseg, sizeof(size_t)), &codeseg, sizeof(void *));
+ *code = ADD_TO_POINTER(codeseg, overhead);
+ return ADD_TO_POINTER(dataseg, overhead);
+}
+
+void
+ffi_closure_free (void *ptr)
+{
+ void *codeseg, *dataseg;
+ size_t rounded_size;
+
+ dataseg = ADD_TO_POINTER(ptr, -overhead);
+ memcpy(&rounded_size, dataseg, sizeof(rounded_size));
+ memcpy(&codeseg, ADD_TO_POINTER(dataseg, sizeof(size_t)), sizeof(void *));
+ munmap(dataseg, rounded_size);
+ munmap(codeseg, rounded_size);
+}
+
+int
+ffi_tramp_is_present (__attribute__((unused)) void *ptr)
+{
+ return 0;
+}
+#else /* !NetBSD with PROT_MPROTECT */
#if !FFI_MMAP_EXEC_WRIT && !FFI_EXEC_TRAMPOLINE_TABLE
-# if __gnu_linux__ && !defined(__ANDROID__)
+# if __linux__ && !defined(__ANDROID__)
/* This macro indicates it may be forbidden to map anonymous memory
with both write and execute permission. Code compiled when this
option is defined will attempt to map such pages once, but if it
@@ -45,7 +133,7 @@
# define FFI_MMAP_EXEC_WRIT 1
# define HAVE_MNTENT 1
# endif
-# if defined(X86_WIN32) || defined(X86_WIN64) || defined(__OS2__)
+# if defined(_WIN32) || defined(__OS2__)
/* Windows systems may have Data Execution Protection (DEP) enabled,
which requires the use of VirtualMalloc/VirtualFree to alloc/free
executable memory. */
@@ -54,7 +142,7 @@
#endif
#if FFI_MMAP_EXEC_WRIT && !defined FFI_MMAP_EXEC_SELINUX
-# ifdef __linux__
+# if defined(__linux__) && !defined(__ANDROID__)
/* When defined to 1 check for SELinux and if SELinux is active,
don't attempt PROT_EXEC|PROT_WRITE mapping at all, as that
might cause audit messages. */
@@ -64,11 +152,226 @@
#if FFI_CLOSURES
-# if FFI_EXEC_TRAMPOLINE_TABLE
+#if FFI_EXEC_TRAMPOLINE_TABLE
+
+#ifdef __MACH__
+
+#include <mach/mach.h>
+#include <pthread.h>
+#ifdef HAVE_PTRAUTH
+#include <ptrauth.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+extern void *ffi_closure_trampoline_table_page;
+
+typedef struct ffi_trampoline_table ffi_trampoline_table;
+typedef struct ffi_trampoline_table_entry ffi_trampoline_table_entry;
+
+struct ffi_trampoline_table
+{
+ /* contiguous writable and executable pages */
+ vm_address_t config_page;
+
+ /* free list tracking */
+ uint16_t free_count;
+ ffi_trampoline_table_entry *free_list;
+ ffi_trampoline_table_entry *free_list_pool;
+
+ ffi_trampoline_table *prev;
+ ffi_trampoline_table *next;
+};
+
+struct ffi_trampoline_table_entry
+{
+ void *(*trampoline) (void);
+ ffi_trampoline_table_entry *next;
+};
+
+/* Total number of trampolines that fit in one trampoline table */
+#define FFI_TRAMPOLINE_COUNT (PAGE_MAX_SIZE / FFI_TRAMPOLINE_SIZE)
+
+static pthread_mutex_t ffi_trampoline_lock = PTHREAD_MUTEX_INITIALIZER;
+static ffi_trampoline_table *ffi_trampoline_tables = NULL;
+
+static ffi_trampoline_table *
+ffi_trampoline_table_alloc (void)
+{
+ ffi_trampoline_table *table;
+ vm_address_t config_page;
+ vm_address_t trampoline_page;
+ vm_address_t trampoline_page_template;
+ vm_prot_t cur_prot;
+ vm_prot_t max_prot;
+ kern_return_t kt;
+ uint16_t i;
+
+ /* Allocate two pages -- a config page and a placeholder page */
+ config_page = 0x0;
+ kt = vm_allocate (mach_task_self (), &config_page, PAGE_MAX_SIZE * 2,
+ VM_FLAGS_ANYWHERE);
+ if (kt != KERN_SUCCESS)
+ return NULL;
+
+ /* Remap the trampoline table on top of the placeholder page */
+ trampoline_page = config_page + PAGE_MAX_SIZE;
+
+#ifdef HAVE_PTRAUTH
+ trampoline_page_template = (vm_address_t)(uintptr_t)ptrauth_auth_data((void *)&ffi_closure_trampoline_table_page, ptrauth_key_function_pointer, 0);
+#else
+ trampoline_page_template = (vm_address_t)&ffi_closure_trampoline_table_page;
+#endif
+
+#ifdef __arm__
+ /* ffi_closure_trampoline_table_page can be thumb-biased on some ARM archs */
+ trampoline_page_template &= ~1UL;
+#endif
+ kt = vm_remap (mach_task_self (), &trampoline_page, PAGE_MAX_SIZE, 0x0,
+ VM_FLAGS_OVERWRITE, mach_task_self (), trampoline_page_template,
+ FALSE, &cur_prot, &max_prot, VM_INHERIT_SHARE);
+ if (kt != KERN_SUCCESS || !(cur_prot & VM_PROT_EXECUTE))
+ {
+ vm_deallocate (mach_task_self (), config_page, PAGE_MAX_SIZE * 2);
+ return NULL;
+ }
+
+ /* We have valid trampoline and config pages */
+ table = calloc (1, sizeof (ffi_trampoline_table));
+ table->free_count = FFI_TRAMPOLINE_COUNT;
+ table->config_page = config_page;
+
+ /* Create and initialize the free list */
+ table->free_list_pool =
+ calloc (FFI_TRAMPOLINE_COUNT, sizeof (ffi_trampoline_table_entry));
+
+ for (i = 0; i < table->free_count; i++)
+ {
+ ffi_trampoline_table_entry *entry = &table->free_list_pool[i];
+ entry->trampoline =
+ (void *) (trampoline_page + (i * FFI_TRAMPOLINE_SIZE));
+#ifdef HAVE_PTRAUTH
+ entry->trampoline = ptrauth_sign_unauthenticated(entry->trampoline, ptrauth_key_function_pointer, 0);
+#endif
+
+ if (i < table->free_count - 1)
+ entry->next = &table->free_list_pool[i + 1];
+ }
+
+ table->free_list = table->free_list_pool;
+
+ return table;
+}
+
+static void
+ffi_trampoline_table_free (ffi_trampoline_table *table)
+{
+ /* Remove from the list */
+ if (table->prev != NULL)
+ table->prev->next = table->next;
+
+ if (table->next != NULL)
+ table->next->prev = table->prev;
+
+ /* Deallocate pages */
+ vm_deallocate (mach_task_self (), table->config_page, PAGE_MAX_SIZE * 2);
+
+ /* Deallocate free list */
+ free (table->free_list_pool);
+ free (table);
+}
+
+void *
+ffi_closure_alloc (size_t size, void **code)
+{
+ /* Create the closure */
+ ffi_closure *closure = malloc (size);
+ if (closure == NULL)
+ return NULL;
+
+ pthread_mutex_lock (&ffi_trampoline_lock);
+
+ /* Check for an active trampoline table with available entries. */
+ ffi_trampoline_table *table = ffi_trampoline_tables;
+ if (table == NULL || table->free_list == NULL)
+ {
+ table = ffi_trampoline_table_alloc ();
+ if (table == NULL)
+ {
+ pthread_mutex_unlock (&ffi_trampoline_lock);
+ free (closure);
+ return NULL;
+ }
+
+ /* Insert the new table at the top of the list */
+ table->next = ffi_trampoline_tables;
+ if (table->next != NULL)
+ table->next->prev = table;
+
+ ffi_trampoline_tables = table;
+ }
+
+ /* Claim the free entry */
+ ffi_trampoline_table_entry *entry = ffi_trampoline_tables->free_list;
+ ffi_trampoline_tables->free_list = entry->next;
+ ffi_trampoline_tables->free_count--;
+ entry->next = NULL;
+
+ pthread_mutex_unlock (&ffi_trampoline_lock);
+
+ /* Initialize the return values */
+ *code = entry->trampoline;
+ closure->trampoline_table = table;
+ closure->trampoline_table_entry = entry;
+
+ return closure;
+}
+
+void
+ffi_closure_free (void *ptr)
+{
+ ffi_closure *closure = ptr;
+
+ pthread_mutex_lock (&ffi_trampoline_lock);
+
+ /* Fetch the table and entry references */
+ ffi_trampoline_table *table = closure->trampoline_table;
+ ffi_trampoline_table_entry *entry = closure->trampoline_table_entry;
+
+ /* Return the entry to the free list */
+ entry->next = table->free_list;
+ table->free_list = entry;
+ table->free_count++;
+
+ /* If all trampolines within this table are free, and at least one other table exists, deallocate
+ * the table */
+ if (table->free_count == FFI_TRAMPOLINE_COUNT
+ && ffi_trampoline_tables != table)
+ {
+ ffi_trampoline_table_free (table);
+ }
+ else if (ffi_trampoline_tables != table)
+ {
+ /* Otherwise, bump this table to the top of the list */
+ table->prev = NULL;
+ table->next = ffi_trampoline_tables;
+ if (ffi_trampoline_tables != NULL)
+ ffi_trampoline_tables->prev = table;
+
+ ffi_trampoline_tables = table;
+ }
+
+ pthread_mutex_unlock (&ffi_trampoline_lock);
+
+ /* Free the closure */
+ free (closure);
+}
+
+#endif
// Per-target implementation; It's unclear what can reasonable be shared between two OS/architecture implementations.
-# elif FFI_MMAP_EXEC_WRIT /* !FFI_EXEC_TRAMPOLINE_TABLE */
+#elif FFI_MMAP_EXEC_WRIT /* !FFI_EXEC_TRAMPOLINE_TABLE */
#define USE_LOCKS 1
#define USE_DL_PREFIX 1
@@ -94,14 +397,6 @@
/* Don't allocate more than a page unless needed. */
#define DEFAULT_GRANULARITY ((size_t)malloc_getpagesize)
-#if FFI_CLOSURE_TEST
-/* Don't release single pages, to avoid a worst-case scenario of
- continuously allocating and releasing single pages, but release
- pairs of pages, which should do just as well given that allocations
- are likely to be small. */
-#define DEFAULT_TRIM_THRESHOLD ((size_t)malloc_getpagesize)
-#endif
-
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -111,7 +406,7 @@
#endif
#include <string.h>
#include <stdio.h>
-#if !defined(X86_WIN32) && !defined(X86_WIN64)
+#if !defined(_WIN32)
#ifdef HAVE_MNTENT
#include <mntent.h>
#endif /* HAVE_MNTENT */
@@ -237,11 +532,11 @@ static int dlmalloc_trim(size_t) MAYBE_UNUSED;
static size_t dlmalloc_usable_size(void*) MAYBE_UNUSED;
static void dlmalloc_stats(void) MAYBE_UNUSED;
-#if !(defined(X86_WIN32) || defined(X86_WIN64) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX)
+#if !(defined(_WIN32) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX)
/* Use these for mmap and munmap within dlmalloc.c. */
static void *dlmmap(void *, size_t, int, int, int, off_t);
static int dlmunmap(void *, size_t);
-#endif /* !(defined(X86_WIN32) || defined(X86_WIN64) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX) */
+#endif /* !(defined(_WIN32) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX) */
#define mmap dlmmap
#define munmap dlmunmap
@@ -251,7 +546,7 @@ static int dlmunmap(void *, size_t);
#undef mmap
#undef munmap
-#if !(defined(X86_WIN32) || defined(X86_WIN64) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX)
+#if !(defined(_WIN32) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX)
/* A mutex used to synchronize access to *exec* variables in this file. */
static pthread_mutex_t open_temp_exec_file_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -263,6 +558,17 @@ static int execfd = -1;
/* The amount of space already allocated from the temporary file. */
static size_t execsize = 0;
+#ifdef HAVE_MEMFD_CREATE
+/* Create an anonymous, close-on-exec temporary file using memfd_create;
+   no pathname exists, so no unlink is needed.  */
+static int
+open_temp_exec_file_memfd (const char *name)
+{
+ int fd;
+ fd = memfd_create (name, MFD_CLOEXEC);
+ return fd;
+}
+#endif
+
/* Open a temporary file name, and immediately unlink it. */
static int
open_temp_exec_file_name (char *name, int flags)
@@ -308,7 +614,7 @@ open_temp_exec_file_dir (const char *dir)
}
#endif
- lendir = strlen (dir);
+ lendir = (int) strlen (dir);
tempname = __builtin_alloca (lendir + sizeof (suffix));
if (!tempname)
@@ -390,6 +696,10 @@ static struct
const char *arg;
int repeat;
} open_temp_exec_file_opts[] = {
+#ifdef HAVE_MEMFD_CREATE
+ { open_temp_exec_file_memfd, "libffi", 0 },
+#endif
+ { open_temp_exec_file_env, "LIBFFI_TMPDIR", 0 },
{ open_temp_exec_file_env, "TMPDIR", 0 },
{ open_temp_exec_file_dir, "/tmp", 0 },
{ open_temp_exec_file_dir, "/var/tmp", 0 },
@@ -449,6 +759,36 @@ open_temp_exec_file (void)
return fd;
}
+/* We need to allocate space in a file that will be backing a writable
+ mapping. Several problems exist with the usual approaches:
+ - fallocate() is Linux-only
+ - posix_fallocate() is not available on all platforms
+ - ftruncate() does not allocate space on filesystems with sparse files
+ Failure to allocate the space will cause SIGBUS to be thrown when
+ the mapping is subsequently written to. */
+static int
+allocate_space (int fd, off_t offset, off_t len)
+{
+ static size_t page_size;
+
+ /* Obtain system page size. */
+ if (!page_size)
+ page_size = sysconf(_SC_PAGESIZE);
+
+ unsigned char buf[page_size];
+ memset (buf, 0, page_size);
+
+ while (len > 0)
+ {
+ off_t to_write = (len < page_size) ? len : page_size;
+ if (write (fd, buf, to_write) < to_write)
+ return -1;
+ len -= to_write;
+ }
+
+ return 0;
+}
+
/* Map in a chunk of memory from the temporary exec file into separate
locations in the virtual memory address space, one writable and one
executable. Returns the address of the writable portion, after
@@ -470,7 +810,7 @@ dlmmap_locked (void *start, size_t length, int prot, int flags, off_t offset)
offset = execsize;
- if (ftruncate (execfd, offset + length))
+ if (allocate_space (execfd, offset, length))
return MFAIL;
flags &= ~(MAP_PRIVATE | MAP_ANONYMOUS);
@@ -485,7 +825,13 @@ dlmmap_locked (void *start, size_t length, int prot, int flags, off_t offset)
close (execfd);
goto retry_open;
}
- ftruncate (execfd, offset);
+ if (ftruncate (execfd, offset) != 0)
+ {
+ /* FIXME: Error logs can be added here. Returning an error for
+ * ftruncate() would not add any advantage as it is already
+ * being validated in the error case. */
+ }
+
return MFAIL;
}
else if (!offset
@@ -497,7 +843,12 @@ dlmmap_locked (void *start, size_t length, int prot, int flags, off_t offset)
if (start == MFAIL)
{
munmap (ptr, length);
- ftruncate (execfd, offset);
+ if (ftruncate (execfd, offset) != 0)
+ {
+ /* FIXME: Error logs can be added here. Returning an error for
+ * ftruncate() would not add any advantage as it is already
+ * being validated in the error case. */
+ }
return start;
}
@@ -521,9 +872,11 @@ dlmmap (void *start, size_t length, int prot,
&& flags == (MAP_PRIVATE | MAP_ANONYMOUS)
&& fd == -1 && offset == 0);
-#if FFI_CLOSURE_TEST
- printf ("mapping in %zi\n", length);
-#endif
+ if (execfd == -1 && ffi_tramp_is_supported ())
+ {
+ ptr = mmap (start, length, prot & ~PROT_EXEC, flags, fd, offset);
+ return ptr;
+ }
if (execfd == -1 && is_emutramp_enabled ())
{
@@ -570,10 +923,6 @@ dlmunmap (void *start, size_t length)
msegmentptr seg = segment_holding (gm, start);
void *code;
-#if FFI_CLOSURE_TEST
- printf ("unmapping %zi\n", length);
-#endif
-
if (seg && (code = add_segment_exec_offset (start, seg)) != start)
{
int ret = munmap (code, length);
@@ -600,7 +949,7 @@ segment_holding_code (mstate m, char* addr)
}
#endif
-#endif /* !(defined(X86_WIN32) || defined(X86_WIN64) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX) */
+#endif /* !(defined(_WIN32) || defined(__OS2__)) || defined (__CYGWIN__) || defined(__INTERIX) */
/* Allocate a chunk of memory with the given size. Returns a pointer
to the writable address, and sets *CODE to the executable
@@ -608,23 +957,52 @@ segment_holding_code (mstate m, char* addr)
void *
ffi_closure_alloc (size_t size, void **code)
{
- void *ptr;
+ void *ptr, *ftramp;
if (!code)
return NULL;
- ptr = dlmalloc (size);
+ ptr = FFI_CLOSURE_PTR (dlmalloc (size));
if (ptr)
{
msegmentptr seg = segment_holding (gm, ptr);
*code = add_segment_exec_offset (ptr, seg);
+ if (!ffi_tramp_is_supported ())
+ return ptr;
+
+ ftramp = ffi_tramp_alloc (0);
+ if (ftramp == NULL)
+ {
+ dlfree (FFI_RESTORE_PTR (ptr));
+ return NULL;
+ }
+ *code = ffi_tramp_get_addr (ftramp);
+ ((ffi_closure *) ptr)->ftramp = ftramp;
}
return ptr;
}
+void *
+ffi_data_to_code_pointer (void *data)
+{
+ msegmentptr seg = segment_holding (gm, data);
+ /* We expect closures to be allocated with ffi_closure_alloc(), in
+ which case seg will be non-NULL. However, some users take on the
+ burden of managing this memory themselves, in which case
+ we'll just return data. */
+ if (seg)
+ {
+ if (!ffi_tramp_is_supported ())
+ return add_segment_exec_offset (data, seg);
+ return ffi_tramp_get_addr (((ffi_closure *) data)->ftramp);
+ }
+ else
+ return data;
+}
+
/* Release a chunk of memory allocated with ffi_closure_alloc. If
FFI_CLOSURE_FREE_CODE is nonzero, the given address can be the
writable or the executable address given. Otherwise, only the
@@ -638,30 +1016,19 @@ ffi_closure_free (void *ptr)
if (seg)
ptr = sub_segment_exec_offset (ptr, seg);
#endif
+ if (ffi_tramp_is_supported ())
+ ffi_tramp_free (((ffi_closure *) ptr)->ftramp);
- dlfree (ptr);
+ dlfree (FFI_RESTORE_PTR (ptr));
}
-
-#if FFI_CLOSURE_TEST
-/* Do some internal sanity testing to make sure allocation and
- deallocation of pages are working as intended. */
-int main ()
-{
- void *p[3];
-#define GET(idx, len) do { p[idx] = dlmalloc (len); printf ("allocated %zi for p[%i]\n", (len), (idx)); } while (0)
-#define PUT(idx) do { printf ("freeing p[%i]\n", (idx)); dlfree (p[idx]); } while (0)
- GET (0, malloc_getpagesize / 2);
- GET (1, 2 * malloc_getpagesize - 64 * sizeof (void*));
- PUT (1);
- GET (1, 2 * malloc_getpagesize);
- GET (2, malloc_getpagesize / 2);
- PUT (1);
- PUT (0);
- PUT (2);
- return 0;
+int
+ffi_tramp_is_present (void *ptr)
+{
+ msegmentptr seg = segment_holding (gm, ptr);
+ return seg != NULL && ffi_tramp_is_supported();
}
-#endif /* FFI_CLOSURE_TEST */
+
# else /* ! FFI_MMAP_EXEC_WRIT */
/* On many systems, memory returned by malloc is writable and
@@ -675,14 +1042,28 @@ ffi_closure_alloc (size_t size, void **code)
if (!code)
return NULL;
- return *code = malloc (size);
+ return *code = FFI_CLOSURE_PTR (malloc (size));
}
void
ffi_closure_free (void *ptr)
{
- free (ptr);
+ free (FFI_RESTORE_PTR (ptr));
+}
+
+void *
+ffi_data_to_code_pointer (void *data)
+{
+ return data;
+}
+
+int
+ffi_tramp_is_present (__attribute__((unused)) void *ptr)
+{
+ return 0;
}
# endif /* ! FFI_MMAP_EXEC_WRIT */
#endif /* FFI_CLOSURES */
+
+#endif /* NetBSD with PROT_MPROTECT */
diff --git a/libffi/src/cris/ffi.c b/libffi/src/cris/ffi.c
index aaca5b1..9011fde 100644
--- a/libffi/src/cris/ffi.c
+++ b/libffi/src/cris/ffi.c
@@ -29,7 +29,7 @@
#include <ffi.h>
#include <ffi_common.h>
-#define STACK_ARG_SIZE(x) ALIGN(x, FFI_SIZEOF_ARG)
+#define STACK_ARG_SIZE(x) FFI_ALIGN(x, FFI_SIZEOF_ARG)
static ffi_status
initialize_aggregate_packed_struct (ffi_type * arg)
@@ -190,7 +190,7 @@ ffi_prep_cif_core (ffi_cif * cif,
FFI_ASSERT_VALID_TYPE (*ptr);
if (((*ptr)->alignment - 1) & bytes)
- bytes = ALIGN (bytes, (*ptr)->alignment);
+ bytes = FFI_ALIGN (bytes, (*ptr)->alignment);
if ((*ptr)->type == FFI_TYPE_STRUCT)
{
if ((*ptr)->size > 8)
diff --git a/libffi/src/csky/ffi.c b/libffi/src/csky/ffi.c
new file mode 100644
index 0000000..af50b7c
--- /dev/null
+++ b/libffi/src/csky/ffi.c
@@ -0,0 +1,395 @@
+/* -----------------------------------------------------------------------
+ ffi.c
+
+ CSKY Foreign Function Interface
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+
+#include <stdlib.h>
+
+/* ffi_prep_args is called by the assembly routine once stack space
+ has been allocated for the function's arguments
+*/
+void ffi_prep_args(char *stack, extended_cif *ecif)
+{
+ register unsigned int i;
+ register void **p_argv;
+ register char *argp;
+ register ffi_type **p_arg;
+
+ argp = stack;
+
+ if ( ecif->cif->flags == FFI_TYPE_STRUCT ) {
+ *(void **) argp = ecif->rvalue;
+ argp += 4;
+ }
+
+ p_argv = ecif->avalue;
+
+ for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
+ (i != 0);
+ i--, p_arg++)
+ {
+ size_t z;
+ size_t alignment;
+
+ /* Align if necessary */
+ alignment = (*p_arg)->alignment;
+#ifdef __CSKYABIV1__
+ /*
+ * Adapt ABIV1 bug.
+ * If a struct's size is larger than 8 bytes, it is always aligned to 4 bytes.
+ */
+ if (((*p_arg)->type == FFI_TYPE_STRUCT) && ((*p_arg)->size > 8) && (alignment == 8)) {
+ alignment = 4;
+ }
+#endif
+
+ if ((alignment - 1) & (unsigned) argp) {
+ argp = (char *) FFI_ALIGN(argp, alignment);
+ }
+
+ if ((*p_arg)->type == FFI_TYPE_STRUCT)
+ argp = (char *) FFI_ALIGN(argp, 4);
+
+ z = (*p_arg)->size;
+ if (z < sizeof(int))
+ {
+ z = sizeof(int);
+ switch ((*p_arg)->type)
+ {
+ case FFI_TYPE_SINT8:
+ *(signed int *) argp = (signed int)*(SINT8 *)(* p_argv);
+ break;
+
+ case FFI_TYPE_UINT8:
+ *(unsigned int *) argp = (unsigned int)*(UINT8 *)(* p_argv);
+ break;
+
+ case FFI_TYPE_SINT16:
+ *(signed int *) argp = (signed int)*(SINT16 *)(* p_argv);
+ break;
+
+ case FFI_TYPE_UINT16:
+ *(unsigned int *) argp = (unsigned int)*(UINT16 *)(* p_argv);
+ break;
+
+ case FFI_TYPE_STRUCT:
+#ifdef __CSKYBE__
+ memcpy((argp + 4 - (*p_arg)->size), *p_argv, (*p_arg)->size);
+#else
+ memcpy(argp, *p_argv, (*p_arg)->size);
+#endif
+ break;
+
+ default:
+ FFI_ASSERT(0);
+ }
+ }
+ else if (z == sizeof(int))
+ {
+ *(unsigned int *) argp = (unsigned int)*(UINT32 *)(* p_argv);
+ }
+ else
+ {
+ memcpy(argp, *p_argv, z);
+ }
+ p_argv++;
+ argp += z;
+ }
+
+ return;
+}
+
+/* Perform machine dependent cif processing */
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+{
+ /* Round the stack up to a multiple of 8 bytes. This isn't needed
+ everywhere, but it is on some platforms, and it doesn't harm anything
+ when it isn't needed. */
+ cif->bytes = (cif->bytes + 7) & ~7;
+
+ /* Set the return type flag */
+ switch (cif->rtype->type)
+ {
+
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_SINT64:
+ case FFI_TYPE_UINT64:
+ cif->flags = (unsigned) FFI_TYPE_SINT64;
+ break;
+
+ case FFI_TYPE_STRUCT:
+ if (cif->rtype->size <= 4)
+ /* A Composite Type not larger than 4 bytes is returned in r0. */
+ cif->flags = (unsigned)FFI_TYPE_INT;
+ else if (cif->rtype->size <= 8)
+ /* A Composite Type not larger than 8 bytes is returned in r0, r1. */
+ cif->flags = (unsigned)FFI_TYPE_SINT64;
+ else
+ /* A Composite Type larger than 8 bytes, or whose size cannot
+ be determined statically ... is stored in memory at an
+ address passed [in r0]. */
+ cif->flags = (unsigned)FFI_TYPE_STRUCT;
+ break;
+
+ default:
+ cif->flags = FFI_TYPE_INT;
+ break;
+ }
+
+ return FFI_OK;
+}
+
+/* Perform machine dependent cif processing for variadic calls */
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
+ unsigned int nfixedargs,
+ unsigned int ntotalargs)
+{
+ return ffi_prep_cif_machdep(cif);
+}
+
+/* Prototypes for assembly functions, in sysv.S */
+extern void ffi_call_SYSV (void (*fn)(void), extended_cif *, unsigned, unsigned, unsigned *);
+
+void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+ extended_cif ecif;
+
+ int small_struct = (cif->flags == FFI_TYPE_INT
+ && cif->rtype->type == FFI_TYPE_STRUCT);
+
+ ecif.cif = cif;
+ ecif.avalue = avalue;
+
+ unsigned int temp;
+
+ /* If the return value is a struct and we don't have a return */
+ /* value address then we need to make one */
+
+ if ((rvalue == NULL) &&
+ (cif->flags == FFI_TYPE_STRUCT))
+ {
+ ecif.rvalue = alloca(cif->rtype->size);
+ }
+ else if (small_struct)
+ ecif.rvalue = &temp;
+ else
+ ecif.rvalue = rvalue;
+
+ switch (cif->abi)
+ {
+ case FFI_SYSV:
+ ffi_call_SYSV (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
+ break;
+
+ default:
+ FFI_ASSERT(0);
+ break;
+ }
+ if (small_struct)
+#ifdef __CSKYBE__
+ memcpy (rvalue, ((unsigned char *)&temp + (4 - cif->rtype->size)), cif->rtype->size);
+#else
+ memcpy (rvalue, &temp, cif->rtype->size);
+#endif
+}
+
+/** private members **/
+
+static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
+ void** args, ffi_cif* cif);
+
+void ffi_closure_SYSV (ffi_closure *);
+
+/* This function is jumped to by the trampoline */
+
+unsigned int
+ffi_closure_SYSV_inner (closure, respp, args)
+ ffi_closure *closure;
+ void **respp;
+ void *args;
+{
+ // our various things...
+ ffi_cif *cif;
+ void **arg_area;
+
+ cif = closure->cif;
+ arg_area = (void**) alloca (cif->nargs * sizeof (void*));
+
+ /* this call will initialize ARG_AREA, such that each
+ * element in that array points to the corresponding
+ * value on the stack; and if the function returns
+ * a structure, it will re-set RESP to point to the
+ * structure return address. */
+
+ ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
+
+ (closure->fun) (cif, *respp, arg_area, closure->user_data);
+
+#ifdef __CSKYBE__
+ if (cif->flags == FFI_TYPE_INT && cif->rtype->type == FFI_TYPE_STRUCT) {
+ unsigned int tmp = 0;
+ tmp = *(unsigned int *)(*respp);
+ *(unsigned int *)(*respp) = (tmp >> ((4 - cif->rtype->size) * 8));
+ }
+#endif
+
+ return cif->flags;
+}
+
+
+static void
+ffi_prep_incoming_args_SYSV(char *stack, void **rvalue,
+ void **avalue, ffi_cif *cif)
+{
+ register unsigned int i;
+ register void **p_argv;
+ register char *argp;
+ register ffi_type **p_arg;
+
+ argp = stack;
+
+ if ( cif->flags == FFI_TYPE_STRUCT ) {
+ *rvalue = *(void **) argp;
+ argp += 4;
+ }
+
+ p_argv = avalue;
+
+ for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
+ {
+ size_t z;
+ size_t alignment;
+
+ alignment = (*p_arg)->alignment;
+ if (alignment < 4)
+ alignment = 4;
+
+#ifdef __CSKYABIV1__
+ /*
+ * Adapt ABIV1 bug.
+ * If a struct's size is larger than 8 bytes, it is always aligned to 4 bytes.
+ */
+ if (((*p_arg)->type == FFI_TYPE_STRUCT) && ((*p_arg)->size > 8) && (alignment == 8)) {
+ alignment = 4;
+ }
+#endif
+
+ /* Align if necessary */
+ if ((alignment - 1) & (unsigned) argp) {
+ argp = (char *) FFI_ALIGN(argp, alignment);
+ }
+
+ z = (*p_arg)->size;
+
+#ifdef __CSKYBE__
+ unsigned int tmp = 0;
+ if ((*p_arg)->size < 4) {
+ tmp = *(unsigned int *)argp;
+ memcpy(argp, ((unsigned char *)&tmp + (4 - (*p_arg)->size)), (*p_arg)->size);
+ }
+#else
+ /* because we're little endian, this is what it turns into. */
+#endif
+ *p_argv = (void*) argp;
+
+ p_argv++;
+ argp += z;
+ }
+
+ return;
+}
+
+/* How to make a trampoline. */
+
+extern unsigned char ffi_csky_trampoline[TRAMPOLINE_SIZE];
+
+/*
+ * Since there is no __clear_cache in libgcc in csky toolchain.
+ * define ffi_csky_cacheflush in sysv.S.
+ * void ffi_csky_cacheflush(uint32 start_addr, uint32 size, int cache)
+ */
+#define CACHEFLUSH_IN_FFI 1
+#if CACHEFLUSH_IN_FFI
+extern void ffi_csky_cacheflush(unsigned char *__tramp, unsigned int k,
+ int i);
+#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
+({ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+ unsigned int __fun = (unsigned int)(FUN); \
+ unsigned int __ctx = (unsigned int)(CTX); \
+ unsigned char *insns = (unsigned char *)(CTX); \
+ memcpy (__tramp, ffi_csky_trampoline, TRAMPOLINE_SIZE); \
+ *(unsigned int*) &__tramp[TRAMPOLINE_SIZE] = __ctx; \
+ *(unsigned int*) &__tramp[TRAMPOLINE_SIZE + 4] = __fun; \
+ ffi_csky_cacheflush(&__tramp[0], TRAMPOLINE_SIZE, 3); /* Clear data mapping. */ \
+ ffi_csky_cacheflush(insns, TRAMPOLINE_SIZE, 3); \
+ /* Clear instruction \
+ mapping. */ \
+ })
+#else
+#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
+({ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+ unsigned int __fun = (unsigned int)(FUN); \
+ unsigned int __ctx = (unsigned int)(CTX); \
+ unsigned char *insns = (unsigned char *)(CTX); \
+ memcpy (__tramp, ffi_csky_trampoline, TRAMPOLINE_SIZE); \
+ *(unsigned int*) &__tramp[TRAMPOLINE_SIZE] = __ctx; \
+ *(unsigned int*) &__tramp[TRAMPOLINE_SIZE + 4] = __fun; \
+ __clear_cache((&__tramp[0]), (&__tramp[TRAMPOLINE_SIZE-1])); /* Clear data mapping. */ \
+ __clear_cache(insns, insns + TRAMPOLINE_SIZE); \
+ /* Clear instruction \
+ mapping. */ \
+ })
+#endif
+
+/* the cif must already be prep'ed */
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+ ffi_cif* cif,
+ void (*fun)(ffi_cif*,void*,void**,void*),
+ void *user_data,
+ void *codeloc)
+{
+ void (*closure_func)(ffi_closure*) = NULL;
+
+ if (cif->abi == FFI_SYSV)
+ closure_func = &ffi_closure_SYSV;
+ else
+ return FFI_BAD_ABI;
+
+ FFI_INIT_TRAMPOLINE (&closure->tramp[0], \
+ closure_func, \
+ codeloc);
+
+ closure->cif = cif;
+ closure->user_data = user_data;
+ closure->fun = fun;
+
+ return FFI_OK;
+}
+
+
diff --git a/libffi/src/csky/ffitarget.h b/libffi/src/csky/ffitarget.h
new file mode 100644
index 0000000..f770aac
--- /dev/null
+++ b/libffi/src/csky/ffitarget.h
@@ -0,0 +1,63 @@
+/* -----------------------------------------------------------------*-C-*-
+ ffitarget.h - Copyright (c) 2012 Anthony Green
+ Copyright (c) 2010 CodeSourcery
+ Copyright (c) 1996-2003 Red Hat, Inc.
+
+ Target configuration macros for CSKY.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+
+ ----------------------------------------------------------------------- */
+
+#ifndef LIBFFI_TARGET_H
+#define LIBFFI_TARGET_H
+
+#ifndef LIBFFI_H
+#error "Please do not include ffitarget.h directly into your source. Use ffi.h instead."
+#endif
+
+#ifndef LIBFFI_ASM
+typedef unsigned long ffi_arg;
+typedef signed long ffi_sarg;
+
+typedef enum ffi_abi {
+ FFI_FIRST_ABI = 0,
+ FFI_SYSV,
+ FFI_LAST_ABI,
+ FFI_DEFAULT_ABI = FFI_SYSV,
+} ffi_abi;
+#endif
+
+#ifdef __CSKYABIV2__
+#define FFI_ASM_ARGREG_SIZE 16
+#define TRAMPOLINE_SIZE 16
+#define FFI_TRAMPOLINE_SIZE 24
+#else
+#define FFI_ASM_ARGREG_SIZE 24
+#define TRAMPOLINE_SIZE 20
+#define FFI_TRAMPOLINE_SIZE 28
+#endif
+
+/* ---- Definitions for closures ----------------------------------------- */
+
+#define FFI_CLOSURES 1
+#define FFI_NATIVE_RAW_API 0
+#endif
diff --git a/libffi/src/csky/sysv.S b/libffi/src/csky/sysv.S
new file mode 100644
index 0000000..21670bf
--- /dev/null
+++ b/libffi/src/csky/sysv.S
@@ -0,0 +1,371 @@
+/* -----------------------------------------------------------------------
+ sysv.S
+
+ CSKY Foreign Function Interface
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+
+.macro CSKY_FUNC_START name
+ .text
+ .align 2
+ .globl \name
+ .type \name, @function
+ \name:
+.endm
+
+#ifdef __CSKYABIV2__
+
+ /*
+ * a0: fn
+ * a1: &ecif
+ * a2: cif->bytes
+ * a3: fig->flags
+ * sp+0: ecif.rvalue
+ */
+CSKY_FUNC_START ffi_call_SYSV
+ /* Save registers */
+ .cfi_startproc
+ subi sp, 28
+ .cfi_def_cfa_offset 28
+ stw a0, (sp, 0x0)
+ .cfi_offset 0, -28
+ stw a1, (sp, 0x4)
+ .cfi_offset 1, -24
+ stw a2, (sp, 0x8)
+ .cfi_offset 2, -20
+ stw a3, (sp, 0xC)
+ .cfi_offset 3, -16
+ stw l0, (sp, 0x10)
+ .cfi_offset 4, -12
+ stw l1, (sp, 0x14)
+ .cfi_offset 5, -8
+ stw lr, (sp, 0x18)
+ .cfi_offset 15, -4
+
+ mov l0, sp
+ .cfi_def_cfa_register 4
+
+ /* Make room for all of the new args. */
+ subu sp, sp, a2
+
+ /* Place all of the ffi_prep_args in position */
+ mov a0, sp
+ /* a1 already set */
+
+ /* Call ffi_prep_args(stack, &ecif) */
+ jsri ffi_prep_args
+
+ /* move first 4 parameters in registers */
+ ldw a0, (sp, 0x0)
+ ldw a1, (sp, 0x4)
+ ldw a2, (sp, 0x8)
+ ldw a3, (sp, 0xC)
+
+ /* and adjust stack */
+ subu lr, l0, sp /* cif->bytes == l0 - sp */
+ cmphsi lr, 16
+ movi l1, 16
+ movt lr, l1
+ addu sp, sp, lr
+
+ ldw l1, (l0, 0) /* load fn() in advance */
+
+ /* call (fn) (...) */
+ jsr l1
+
+ /* Remove the space we pushed for the args */
+ mov sp, l0
+
+ /* Load r2 with the pointer to storage for the return value */
+ ldw a2, (sp, 0x1C)
+
+ /* Load r3 with the return type code */
+ ldw a3, (sp, 0xC)
+
+ /* If the return value pointer is NULL, assume no return value. */
+ cmpnei a2, 0
+ bf .Lepilogue
+
+ cmpnei a3, FFI_TYPE_STRUCT
+ bf .Lepilogue
+
+ /* return INT64 */
+ cmpnei a3, FFI_TYPE_SINT64
+ bt .Lretint
+ /* stw a0, (a2, 0x0) at .Lretint */
+ stw a1, (a2, 0x4)
+
+.Lretint:
+ /* return INT */
+ stw a0, (a2, 0x0)
+
+.Lepilogue:
+ ldw a0, (sp, 0x0)
+ ldw a1, (sp, 0x4)
+ ldw a2, (sp, 0x8)
+ ldw a3, (sp, 0xC)
+ ldw l0, (sp, 0x10)
+ ldw l1, (sp, 0x14)
+ ldw lr, (sp, 0x18)
+ addi sp, sp, 28
+ rts
+ .cfi_endproc
+ .size ffi_call_SYSV, .-ffi_call_SYSV
+
+
+ /*
+ * unsigned int FFI_HIDDEN
+ * ffi_closure_SYSV_inner (closure, respp, args)
+ * ffi_closure *closure;
+ * void **respp;
+ * void *args;
+ */
+CSKY_FUNC_START ffi_closure_SYSV
+ .cfi_startproc
+ mov a2, sp
+ addi a1, sp, 16
+ subi sp, sp, 24
+ .cfi_def_cfa_offset 40
+ stw a1, (sp, 0x10)
+ .cfi_offset 1, -24
+ stw lr, (sp, 0x14)
+ .cfi_offset 15, -20
+ stw sp, (sp, 0x8)
+ addi a1, sp, 8
+ jsri ffi_closure_SYSV_inner
+ ldw a0, (sp, 0x0)
+ /*
+ * if FFI_TYPE_SINT64, need a1.
+ * if FFI_TYPE_INT, ignore a1.
+ */
+ ldw a1, (sp, 0x4)
+
+ ldw lr, (sp, 0x14)
+ addi sp, sp, 40
+ rts
+ .cfi_endproc
+ .size ffi_closure_SYSV, .-ffi_closure_SYSV
+
+CSKY_FUNC_START ffi_csky_trampoline
+ subi sp, sp, 16
+ stw a0, (sp, 0x0)
+ stw a1, (sp, 0x4)
+ stw a2, (sp, 0x8)
+ stw a3, (sp, 0xC)
+ lrw a0, [.Lctx]
+ lrw a1, [.Lfun]
+ jmp a1
+.Lctx:
+ mov a0, a0
+ mov a0, a0
+.Lfun:
+
+ .size ffi_csky_trampoline, .-ffi_csky_trampoline
+
+CSKY_FUNC_START ffi_csky_cacheflush
+ mov t0, r7
+ movi r7, 123
+ trap 0
+ mov r7, t0
+ rts
+
+ .size ffi_csky_cacheflush, .-ffi_csky_cacheflush
+
+#else /* !__CSKYABIV2__ */
+
+ /*
+ * a0: fn
+ * a1: &ecif
+ * a2: cif->bytes
+ * a3: fig->flags
+ * a4: ecif.rvalue
+ */
+CSKY_FUNC_START ffi_call_SYSV
+ /* Save registers */
+ .cfi_startproc
+ subi sp, 32
+ subi sp, 8
+ .cfi_def_cfa_offset 40
+ stw a0, (sp, 0x0)
+ .cfi_offset 2, -40
+ stw a1, (sp, 0x4)
+ .cfi_offset 3, -36
+ stw a2, (sp, 0x8)
+ .cfi_offset 4, -32
+ stw a3, (sp, 0xC)
+ .cfi_offset 5, -28
+ stw a4, (sp, 0x10)
+ .cfi_offset 6, -24
+ stw a5, (sp, 0x14)
+ .cfi_offset 7, -20
+ stw l0, (sp, 0x18)
+ .cfi_offset 8, -16
+ stw l1, (sp, 0x1C)
+ .cfi_offset 9, -12
+ stw lr, (sp, 0x20)
+ .cfi_offset 15, -8
+
+ mov l0, sp
+ .cfi_def_cfa_register 8
+
+ /* Make room for all of the new args. */
+ subu sp, sp, a2
+
+ /* Place all of the ffi_prep_args in position */
+ mov a0, sp
+ /* a1 already set */
+
+ /* Call ffi_prep_args(stack, &ecif) */
+ jsri ffi_prep_args
+
+ /* move first 4 parameters in registers */
+ ldw a0, (sp, 0x0)
+ ldw a1, (sp, 0x4)
+ ldw a2, (sp, 0x8)
+ ldw a3, (sp, 0xC)
+ ldw a4, (sp, 0x10)
+ ldw a5, (sp, 0x14)
+
+ /* and adjust stack */
+ mov lr, l0
+ subu lr, sp /* cif->bytes == l0 - sp */
+ movi l1, 24
+ cmphs lr, l1
+ movt lr, l1
+ addu sp, sp, lr
+
+ ldw l1, (l0, 0) /* load fn() in advance */
+
+ /* call (fn) (...) */
+ jsr l1
+
+ /* Remove the space we pushed for the args */
+ mov sp, l0
+
+ /* Load r2 with the pointer to storage for the return value */
+ ldw a2, (sp, 0x10)
+
+ /* Load r3 with the return type code */
+ ldw a3, (sp, 0xC)
+
+ /* If the return value pointer is NULL, assume no return value. */
+ cmpnei a2, 0
+ bf .Lepilogue
+
+ cmpnei a3, FFI_TYPE_STRUCT
+ bf .Lepilogue
+
+ /* return INT64 */
+ cmpnei a3, FFI_TYPE_SINT64
+ bt .Lretint
+ /* stw a0, (a2, 0x0) at .Lretint */
+ stw a1, (a2, 0x4)
+
+.Lretint:
+ /* return INT */
+ stw a0, (a2, 0x0)
+
+.Lepilogue:
+ ldw a0, (sp, 0x0)
+ ldw a1, (sp, 0x4)
+ ldw a2, (sp, 0x8)
+ ldw a3, (sp, 0xC)
+ ldw a4, (sp, 0x10)
+ ldw a5, (sp, 0x14)
+ ldw l0, (sp, 0x18)
+ ldw l1, (sp, 0x1C)
+ ldw lr, (sp, 0x20)
+ addi sp, sp, 32
+ addi sp, sp, 8
+ rts
+ .cfi_endproc
+
+ .size ffi_call_SYSV, .-ffi_call_SYSV
+
+
+ /*
+ * unsigned int FFI_HIDDEN
+ * ffi_closure_SYSV_inner (closure, respp, args)
+ * ffi_closure *closure;
+ * void **respp;
+ * void *args;
+ */
+CSKY_FUNC_START ffi_closure_SYSV
+ .cfi_startproc
+ mov a2, sp
+ mov a1, sp
+ addi a1, 24
+ subi sp, sp, 24
+ .cfi_def_cfa_offset 48
+ stw a1, (sp, 0x10)
+ .cfi_offset 3, -32
+ stw lr, (sp, 0x14)
+ .cfi_offset 15, -28
+ stw sp, (sp, 0x8)
+ mov a1, sp
+ addi a1, 8
+ jsri ffi_closure_SYSV_inner
+ ldw a0, (sp, 0x0)
+ /*
+ * if FFI_TYPE_SINT64, need a1.
+ * if FFI_TYPE_INT, ignore a1.
+ */
+ ldw a1, (sp, 0x4)
+
+ ldw lr, (sp, 0x14)
+ addi sp, sp, 24
+ addi sp, sp, 24
+ rts
+ .cfi_endproc
+
+ .size ffi_closure_SYSV, .-ffi_closure_SYSV
+
+CSKY_FUNC_START ffi_csky_trampoline
+ subi sp, 24
+ stw a0, (sp, 0x0)
+ stw a1, (sp, 0x4)
+ stw a2, (sp, 0x8)
+ stw a3, (sp, 0xC)
+ stw a4, (sp, 0x10)
+ stw a5, (sp, 0x14)
+ lrw a0, [.Lctx]
+ lrw a1, [.Lfun]
+ jmp a1
+.Lctx:
+ mov a0, a0
+ mov a0, a0
+.Lfun:
+
+ .size ffi_csky_trampoline, .-ffi_csky_trampoline
+
+CSKY_FUNC_START ffi_csky_cacheflush
+ lrw r1, 123
+ trap 0
+ rts
+
+ .size ffi_csky_cacheflush, .-ffi_csky_cacheflush
+
+#endif /* __CSKYABIV2__ */
diff --git a/libffi/src/dlmalloc.c b/libffi/src/dlmalloc.c
index 7e4ea83..1aba657 100644
--- a/libffi/src/dlmalloc.c
+++ b/libffi/src/dlmalloc.c
@@ -438,6 +438,11 @@ DEFAULT_MMAP_THRESHOLD default: 256K
*/
+#if defined __linux__ && !defined _GNU_SOURCE
+/* mremap() on Linux requires this via sys/mman.h */
+#define _GNU_SOURCE 1
+#endif
+
#ifndef WIN32
#ifdef _WIN32
#define WIN32 1
@@ -2366,7 +2371,7 @@ static size_t traverse_and_check(mstate m);
#else /* GNUC */
#if USE_BUILTIN_FFS
-#define compute_bit2idx(X, I) I = ffs(X)-1
+#define compute_bit2idx(X, I) I = __builtin_ffs(X)-1
#else /* USE_BUILTIN_FFS */
#define compute_bit2idx(X, I)\
diff --git a/libffi/src/frv/ffi.c b/libffi/src/frv/ffi.c
index 5698c89..ed1c65a 100644
--- a/libffi/src/frv/ffi.c
+++ b/libffi/src/frv/ffi.c
@@ -107,7 +107,7 @@ void *ffi_prep_args(char *stack, extended_cif *ecif)
count += z;
}
- return (stack + ((count > 24) ? 24 : ALIGN_DOWN(count, 8)));
+ return (stack + ((count > 24) ? 24 : FFI_ALIGN_DOWN(count, 8)));
}
/* Perform machine dependent cif processing */
@@ -118,7 +118,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
else
cif->flags = cif->rtype->size;
- cif->bytes = ALIGN (cif->bytes, 8);
+ cif->bytes = FFI_ALIGN (cif->bytes, 8);
return FFI_OK;
}
diff --git a/libffi/src/ia64/ffi.c b/libffi/src/ia64/ffi.c
index b77a836..b1d04c3 100644
--- a/libffi/src/ia64/ffi.c
+++ b/libffi/src/ia64/ffi.c
@@ -220,8 +220,8 @@ hfa_element_type (ffi_type *type, int nested)
/* Perform machine dependent cif processing. */
-ffi_status
-ffi_prep_cif_machdep(ffi_cif *cif)
+static ffi_status
+ffi_prep_cif_machdep_core(ffi_cif *cif)
{
int flags;
@@ -271,6 +271,22 @@ ffi_prep_cif_machdep(ffi_cif *cif)
return FFI_OK;
}
+ffi_status
+ffi_prep_cif_machdep(ffi_cif *cif)
+{
+ cif->nfixedargs = cif->nargs;
+ return ffi_prep_cif_machdep_core(cif);
+}
+
+ffi_status
+ffi_prep_cif_machdep_var(ffi_cif *cif,
+ unsigned int nfixedargs,
+ unsigned int ntotalargs MAYBE_UNUSED)
+{
+ cif->nfixedargs = nfixedargs;
+ return ffi_prep_cif_machdep_core(cif);
+}
+
extern int ffi_call_unix (struct ia64_args *, PTR64, void (*)(void), UINT64);
void
@@ -454,10 +470,11 @@ ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
ffi_cif *cif;
void **avalue;
ffi_type **p_arg;
- long i, avn, gpcount, fpcount;
+ long i, avn, gpcount, fpcount, nfixedargs;
cif = closure->cif;
avn = cif->nargs;
+ nfixedargs = cif->nfixedargs;
avalue = alloca (avn * sizeof (void *));
/* If the structure return value is passed in memory get that location
@@ -468,6 +485,7 @@ ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
gpcount = fpcount = 0;
for (i = 0, p_arg = cif->arg_types; i < avn; i++, p_arg++)
{
+ int named = i < nfixedargs;
switch ((*p_arg)->type)
{
case FFI_TYPE_SINT8:
@@ -491,7 +509,7 @@ ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
break;
case FFI_TYPE_FLOAT:
- if (gpcount < 8 && fpcount < 8)
+ if (named && gpcount < 8 && fpcount < 8)
{
fpreg *addr = &stack->fp_regs[fpcount++];
float result;
@@ -505,7 +523,7 @@ ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
break;
case FFI_TYPE_DOUBLE:
- if (gpcount < 8 && fpcount < 8)
+ if (named && gpcount < 8 && fpcount < 8)
{
fpreg *addr = &stack->fp_regs[fpcount++];
double result;
@@ -521,7 +539,7 @@ ffi_closure_unix_inner (ffi_closure *closure, struct ia64_args *stack,
case FFI_TYPE_LONGDOUBLE:
if (gpcount & 1)
gpcount++;
- if (LDBL_MANT_DIG == 64 && gpcount < 8 && fpcount < 8)
+ if (LDBL_MANT_DIG == 64 && named && gpcount < 8 && fpcount < 8)
{
fpreg *addr = &stack->fp_regs[fpcount++];
__float80 result;
diff --git a/libffi/src/ia64/ffitarget.h b/libffi/src/ia64/ffitarget.h
index e68cea6..fd5b9a0 100644
--- a/libffi/src/ia64/ffitarget.h
+++ b/libffi/src/ia64/ffitarget.h
@@ -50,6 +50,7 @@ typedef enum ffi_abi {
#define FFI_TRAMPOLINE_SIZE 24 /* Really the following struct, which */
/* can be interpreted as a C function */
/* descriptor: */
+#define FFI_TARGET_SPECIFIC_VARIADIC 1
+#define FFI_EXTRA_CIF_FIELDS unsigned nfixedargs
#endif
-
diff --git a/libffi/src/ia64/unix.S b/libffi/src/ia64/unix.S
index 4d2a86d..e2547e0 100644
--- a/libffi/src/ia64/unix.S
+++ b/libffi/src/ia64/unix.S
@@ -175,7 +175,6 @@ ffi_call_unix:
;;
.Lst_small_struct:
- add sp = -16, sp
cmp.lt p6, p0 = 8, in3
cmp.lt p7, p0 = 16, in3
cmp.lt p8, p0 = 24, in3
@@ -191,6 +190,12 @@ ffi_call_unix:
(p8) st8 [r18] = r11
mov out1 = sp
mov out2 = in3
+ ;;
+ // ia64 software calling convention requires
+ // top 16 bytes of stack to be scratch space
+ // PLT resolver uses that scratch space at
+	// 'memcpy' symbol resolution time
+ add sp = -16, sp
br.call.sptk.many b0 = memcpy#
;;
mov ar.pfs = loc0
@@ -529,6 +534,7 @@ ffi_closure_unix:
data8 @pcrel(.Lst_int64) // FFI_TYPE_SINT64
data8 @pcrel(.Lst_void) // FFI_TYPE_STRUCT
data8 @pcrel(.Lst_int64) // FFI_TYPE_POINTER
+ data8 @pcrel(.Lst_void) // FFI_TYPE_COMPLEX (not implemented)
data8 @pcrel(.Lst_small_struct) // FFI_IA64_TYPE_SMALL_STRUCT
data8 @pcrel(.Lst_hfa_float) // FFI_IA64_TYPE_HFA_FLOAT
data8 @pcrel(.Lst_hfa_double) // FFI_IA64_TYPE_HFA_DOUBLE
@@ -550,6 +556,7 @@ ffi_closure_unix:
data8 @pcrel(.Lld_int) // FFI_TYPE_SINT64
data8 @pcrel(.Lld_void) // FFI_TYPE_STRUCT
data8 @pcrel(.Lld_int) // FFI_TYPE_POINTER
+ data8 @pcrel(.Lld_void) // FFI_TYPE_COMPLEX (not implemented)
data8 @pcrel(.Lld_small_struct) // FFI_IA64_TYPE_SMALL_STRUCT
data8 @pcrel(.Lld_hfa_float) // FFI_IA64_TYPE_HFA_FLOAT
data8 @pcrel(.Lld_hfa_double) // FFI_IA64_TYPE_HFA_DOUBLE
diff --git a/libffi/src/java_raw_api.c b/libffi/src/java_raw_api.c
index 127123d..114d3e4 100644
--- a/libffi/src/java_raw_api.c
+++ b/libffi/src/java_raw_api.c
@@ -114,7 +114,7 @@ ffi_java_raw_to_ptrarray (ffi_cif *cif, ffi_java_raw *raw, void **args)
default:
*args = raw;
raw +=
- ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
+ FFI_ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
}
}
@@ -142,7 +142,7 @@ ffi_java_raw_to_ptrarray (ffi_cif *cif, ffi_java_raw *raw, void **args)
#else /* FFI_SIZEOF_JAVA_RAW != 8 */
*args = (void*) raw;
raw +=
- ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
+ FFI_ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
#endif /* FFI_SIZEOF_JAVA_RAW == 8 */
}
@@ -234,7 +234,7 @@ ffi_java_ptrarray_to_raw (ffi_cif *cif, void **args, ffi_java_raw *raw)
#else
memcpy ((void*) raw->data, (void*)*args, (*tp)->size);
raw +=
- ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
+ FFI_ALIGN ((*tp)->size, sizeof(ffi_java_raw)) / sizeof(ffi_java_raw);
#endif
}
}
diff --git a/libffi/src/kvx/asm.h b/libffi/src/kvx/asm.h
new file mode 100644
index 0000000..4edba41
--- /dev/null
+++ b/libffi/src/kvx/asm.h
@@ -0,0 +1,5 @@
+/* args are passed on registers from r0 up to r11 => 12*8 bytes */
+#define REG_ARGS_SIZE (12*8)
+#define KVX_REGISTER_SIZE (8)
+#define KVX_ABI_SLOT_SIZE (KVX_REGISTER_SIZE)
+#define KVX_ABI_MAX_AGGREGATE_IN_REG_SIZE (4*KVX_ABI_SLOT_SIZE)
diff --git a/libffi/src/kvx/ffi.c b/libffi/src/kvx/ffi.c
new file mode 100644
index 0000000..58f6aef
--- /dev/null
+++ b/libffi/src/kvx/ffi.c
@@ -0,0 +1,273 @@
+/* Copyright (c) 2020 Kalray
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#if defined(__kvx__)
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fficonfig.h>
+#include <ffi.h>
+#include "ffi_common.h"
+#include "asm.h"
+
+#define ALIGN(x, a) ALIGN_MASK(x, (typeof(x))(a) - 1)
+#define ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define KVX_ABI_STACK_ALIGNMENT (32)
+#define KVX_ABI_STACK_ARG_ALIGNMENT (8)
+#define max(a,b) ((a) > (b) ? (a) : (b))
+
+#ifdef FFI_DEBUG
+#define DEBUG_PRINT(...) do{ fprintf( stderr, __VA_ARGS__ ); } while(0)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+struct ret_value {
+ unsigned long int r0;
+ unsigned long int r1;
+ unsigned long int r2;
+ unsigned long int r3;
+};
+
+extern struct ret_value ffi_call_SYSV(unsigned total_size,
+ unsigned size,
+ extended_cif *ecif,
+ unsigned *rvalue_addr,
+ void *fn,
+ unsigned int_ext_method);
+
+/* Perform machine dependent cif processing */
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+{
+ cif->flags = cif->rtype->size;
+ return FFI_OK;
+}
+
+/* ffi_prep_args is called by the assembly routine once stack space
+ has been allocated for the function's arguments */
+
+void *ffi_prep_args(char *stack, unsigned int arg_slots_size, extended_cif *ecif)
+{
+ char *stacktemp = stack;
+ char *current_arg_passed_by_value = stack + arg_slots_size;
+ int i, s;
+ ffi_type **arg;
+ int count = 0;
+ ffi_cif *cif = ecif->cif;
+ void **argv = ecif->avalue;
+
+ arg = cif->arg_types;
+
+ DEBUG_PRINT("stack: %p\n", stack);
+ DEBUG_PRINT("arg_slots_size: %u\n", arg_slots_size);
+ DEBUG_PRINT("current_arg_passed_by_value: %p\n", current_arg_passed_by_value);
+ DEBUG_PRINT("ecif: %p\n", ecif);
+ DEBUG_PRINT("ecif->avalue: %p\n", ecif->avalue);
+
+ for (i = 0; i < cif->nargs; i++) {
+
+ s = KVX_ABI_SLOT_SIZE;
+ switch((*arg)->type) {
+ case FFI_TYPE_SINT8:
+ case FFI_TYPE_UINT8:
+ case FFI_TYPE_SINT16:
+ case FFI_TYPE_UINT16:
+ case FFI_TYPE_SINT32:
+ case FFI_TYPE_UINT32:
+ case FFI_TYPE_FLOAT:
+ case FFI_TYPE_DOUBLE:
+ case FFI_TYPE_UINT64:
+ case FFI_TYPE_SINT64:
+ case FFI_TYPE_POINTER:
+ DEBUG_PRINT("INT64/32/16/8/FLOAT/DOUBLE or POINTER @%p\n", stack);
+ *(uint64_t *) stack = *(uint64_t *)(* argv);
+ break;
+
+ case FFI_TYPE_COMPLEX:
+ if ((*arg)->size == 8)
+ *(_Complex float *) stack = *(_Complex float *)(* argv);
+ else if ((*arg)->size == 16) {
+ *(_Complex double *) stack = *(_Complex double *)(* argv);
+ s = 16;
+ } else
+ abort();
+ break;
+ case FFI_TYPE_STRUCT: {
+ char *value;
+ unsigned int written_size = 0;
+ DEBUG_PRINT("struct by value @%p\n", stack);
+ if ((*arg)->size > KVX_ABI_MAX_AGGREGATE_IN_REG_SIZE) {
+ DEBUG_PRINT("big struct\n");
+ *(uint64_t *) stack = (uintptr_t)current_arg_passed_by_value;
+ value = current_arg_passed_by_value;
+ current_arg_passed_by_value += (*arg)->size;
+ written_size = KVX_ABI_SLOT_SIZE;
+ } else {
+ value = stack;
+ written_size = (*arg)->size;
+ }
+ memcpy(value, *argv, (*arg)->size);
+ s = ALIGN(written_size, KVX_ABI_STACK_ARG_ALIGNMENT);
+ break;
+ }
+ default:
+ printf("Error: unsupported arg type %d\n", (*arg)->type);
+ abort();
+ break;
+
+ }
+ stack += s;
+ count += s;
+ argv++;
+ arg++;
+ }
+#ifdef FFI_DEBUG
+ FFI_ASSERT(((intptr_t)(stacktemp + REG_ARGS_SIZE) & (KVX_ABI_STACK_ALIGNMENT-1)) == 0);
+#endif
+ return stacktemp + REG_ARGS_SIZE;
+}
+
+/* Perform machine dependent cif processing when we have a variadic function */
+
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif, unsigned int nfixedargs,
+ unsigned int ntotalargs)
+{
+ cif->flags = cif->rtype->size;
+ return FFI_OK;
+}
+
+static unsigned long handle_small_int_ext(kvx_intext_method *int_ext_method,
+ const ffi_type *rtype)
+{
+ switch (rtype->type) {
+ case FFI_TYPE_SINT8:
+ *int_ext_method = KVX_RET_SXBD;
+ return KVX_REGISTER_SIZE;
+
+ case FFI_TYPE_SINT16:
+ *int_ext_method = KVX_RET_SXHD;
+ return KVX_REGISTER_SIZE;
+
+ case FFI_TYPE_SINT32:
+ *int_ext_method = KVX_RET_SXWD;
+ return KVX_REGISTER_SIZE;
+
+ case FFI_TYPE_UINT8:
+ *int_ext_method = KVX_RET_ZXBD;
+ return KVX_REGISTER_SIZE;
+
+ case FFI_TYPE_UINT16:
+ *int_ext_method = KVX_RET_ZXHD;
+ return KVX_REGISTER_SIZE;
+
+ case FFI_TYPE_UINT32:
+ *int_ext_method = KVX_RET_ZXWD;
+ return KVX_REGISTER_SIZE;
+
+ default:
+ *int_ext_method = KVX_RET_NONE;
+ return rtype->size;
+ }
+}
+
+void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+ int i;
+ unsigned long int slot_fitting_args_size = 0;
+ unsigned long int total_size = 0;
+ unsigned long int big_struct_size = 0;
+ kvx_intext_method int_extension_method;
+ ffi_type **arg;
+ struct ret_value local_rvalue = {0};
+ size_t wb_size;
+
+
+ /* Calculate size to allocate on stack */
+ for (i = 0, arg = cif->arg_types; i < cif->nargs; i++, arg++) {
+ DEBUG_PRINT("argument %d, type %d, size %lu\n", i, (*arg)->type, (*arg)->size);
+ if (((*arg)->type == FFI_TYPE_STRUCT) || ((*arg)->type == FFI_TYPE_COMPLEX)) {
+ if ((*arg)->size <= KVX_ABI_MAX_AGGREGATE_IN_REG_SIZE) {
+ slot_fitting_args_size += ALIGN((*arg)->size, KVX_ABI_SLOT_SIZE);
+ } else {
+ slot_fitting_args_size += KVX_ABI_SLOT_SIZE; /* aggregate passed by reference */
+ big_struct_size += ALIGN((*arg)->size, KVX_ABI_SLOT_SIZE);
+ }
+ } else if ((*arg)->size <= KVX_ABI_SLOT_SIZE) {
+ slot_fitting_args_size += KVX_ABI_SLOT_SIZE;
+ } else {
+ printf("Error: unsupported arg size %ld arg type %d\n", (*arg)->size, (*arg)->type);
+ abort(); /* should never happen? */
+ }
+ }
+
+ extended_cif ecif;
+ ecif.cif = cif;
+ ecif.avalue = avalue;
+ ecif.rvalue = rvalue;
+
+ /* This implementation allocates anyway for all register based args */
+ slot_fitting_args_size = max(slot_fitting_args_size, REG_ARGS_SIZE);
+ total_size = slot_fitting_args_size + big_struct_size;
+ total_size = ALIGN(total_size, KVX_ABI_STACK_ALIGNMENT);
+
+ /* wb_size: write back size, the size we will need to write back to user
+ * provided buffer. In theory it should always be cif->flags which is
+ * cif->rtype->size. But libffi API mandates that for integral types
+ * of size <= system register size, then we *MUST* write back
+ * the size of system register size.
+ * in our case, if size <= 8 bytes we must write back 8 bytes.
+ * floats, complex and structs are not affected, only integrals.
+ */
+ wb_size = handle_small_int_ext(&int_extension_method, cif->rtype);
+
+ switch (cif->abi) {
+ case FFI_SYSV:
+ DEBUG_PRINT("total_size: %lu\n", total_size);
+ DEBUG_PRINT("slot fitting args size: %lu\n", slot_fitting_args_size);
+ DEBUG_PRINT("rvalue: %p\n", rvalue);
+ DEBUG_PRINT("fn: %p\n", fn);
+ DEBUG_PRINT("rsize: %u\n", cif->flags);
+ DEBUG_PRINT("wb_size: %u\n", wb_size);
+ DEBUG_PRINT("int_extension_method: %u\n", int_extension_method);
+ local_rvalue = ffi_call_SYSV(total_size, slot_fitting_args_size,
+ &ecif, rvalue, fn, int_extension_method);
+ if ((cif->flags <= KVX_ABI_MAX_AGGREGATE_IN_REG_SIZE)
+ && (cif->rtype->type != FFI_TYPE_VOID))
+ memcpy(rvalue, &local_rvalue, wb_size);
+ break;
+ default:
+ abort();
+ break;
+ }
+}
+
+/* Closures not supported yet */
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+ ffi_cif* cif,
+ void (*fun)(ffi_cif*,void*,void**,void*),
+ void *user_data,
+ void *codeloc)
+{
+ return FFI_BAD_ABI;
+}
+
+#endif /* (__kvx__) */
diff --git a/libffi/src/kvx/ffitarget.h b/libffi/src/kvx/ffitarget.h
new file mode 100644
index 0000000..8df8735
--- /dev/null
+++ b/libffi/src/kvx/ffitarget.h
@@ -0,0 +1,75 @@
+/* -----------------------------------------------------------------------
+ ffitarget.h - Copyright (c) 2020 Kalray
+
+ KVX Target configuration macros
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#ifndef LIBFFI_TARGET_H
+#define LIBFFI_TARGET_H
+
+#ifndef LIBFFI_H
+#error "Please do not include ffitarget.h directly into your source. Use ffi.h instead."
+#endif
+
+/* ---- System specific configurations ----------------------------------- */
+
+#ifndef LIBFFI_ASM
+typedef unsigned long ffi_arg;
+typedef signed long ffi_sarg;
+
+typedef enum ffi_abi {
+ FFI_FIRST_ABI = 0,
+ FFI_SYSV,
+ FFI_LAST_ABI,
+ FFI_DEFAULT_ABI = FFI_SYSV
+} ffi_abi;
+
+/* Those values are set depending on return type
+ * they are used in the assembly code in sysv.S
+ */
+typedef enum kvx_intext_method {
+ KVX_RET_NONE = 0,
+ KVX_RET_SXBD = 1,
+ KVX_RET_SXHD = 2,
+ KVX_RET_SXWD = 3,
+ KVX_RET_ZXBD = 4,
+ KVX_RET_ZXHD = 5,
+ KVX_RET_ZXWD = 6
+} kvx_intext_method;
+
+#endif
+
+/* ---- Definitions for closures ----------------------------------------- */
+
+/* This is only to allow Python to compile
+ * but closures are not supported yet
+ */
+#define FFI_CLOSURES 1
+#define FFI_TRAMPOLINE_SIZE 0
+
+#define FFI_NATIVE_RAW_API 0
+#define FFI_TARGET_SPECIFIC_VARIADIC 1
+#define FFI_TARGET_HAS_COMPLEX_TYPE
+
+#endif
+
diff --git a/libffi/src/kvx/sysv.S b/libffi/src/kvx/sysv.S
new file mode 100644
index 0000000..952afc7
--- /dev/null
+++ b/libffi/src/kvx/sysv.S
@@ -0,0 +1,127 @@
+/* Copyright (c) 2020 Kalray
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#if defined(__kvx__)
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+#include <ffi_cfi.h>
+#include <kvx/asm.h>
+
+.text
+.global ffi_call_SYSV
+.type ffi_call_SYSV, @function
+.type ffi_prep_args, @function
+.align 8
+
+/* ffi_call_SYSV
+
+ r0: total size to allocate on stack
+ r1: size of arg slots
+ r2: extended cif structure, DO NOT REMOVE: it is used by ffi_prep_args()
+ r3: return value address
+ r4: function to call
+ r5: integer sign extension method to be used
+*/
+ffi_call_SYSV:
+ addd $r12 = $r12, -64
+ so (-32)[$r12] = $r20r21r22r23
+ ;;
+ sd (0)[$r12] = $r24
+ ;;
+ get $r23 = $ra
+ copyd $r20 = $r12
+ sbfd $r12 = $r0, $r12
+ ;;
+ copyd $r0 = $r12
+ copyd $r21 = $r3
+ copyd $r22 = $r4
+ copyd $r24 = $r5
+ call ffi_prep_args
+ ;;
+ lo $r8r9r10r11 = (64)[$r12]
+ ;;
+ lo $r4r5r6r7 = (32)[$r12]
+ ;;
+ lo $r0r1r2r3 = (0)[$r12]
+ copyd $r12 = $r0
+ /* $r15 is the register used by the ABI to return big (>32 bytes)
+ * structs by value.
+ * It is also referred to as the "struct register" in the ABI.
+ */
+ copyd $r15 = $r21
+ icall $r22
+ ;;
+ pcrel $r4 = @pcrel(.Ltable)
+ cb.deqz $r24 ? .Lend
+ ;;
+ addx8d $r24 = $r24, $r4
+ ;;
+ igoto $r24
+ ;;
+.Ltable:
+0: /* we should never arrive here */
+ goto .Lerror
+ nop
+ ;;
+1: /* Sign extend byte to double */
+ sxbd $r0 = $r0
+ goto .Lend
+ ;;
+2: /* Sign extend half to double */
+ sxhd $r0 = $r0
+ goto .Lend
+ ;;
+3: /* Sign extend word to double */
+ sxwd $r0 = $r0
+ goto .Lend
+ ;;
+4: /* Zero extend byte to double */
+ zxbd $r0 = $r0
+ goto .Lend
+ ;;
+5: /* Zero extend half to double */
+ zxhd $r0 = $r0
+ goto .Lend
+ ;;
+6: /* Zero extend word to double */
+ zxwd $r0 = $r0
+ /* Fallthrough to .Lend */
+ ;;
+.Lend:
+ ld $r24 = (0)[$r12]
+ ;;
+ set $ra = $r23
+ lo $r20r21r22r23 = (32)[$r20]
+ addd $r12 = $r20, 64
+ ;;
+ ret
+ ;;
+.Lerror:
+ errop
+ ;;
+
+#endif /* __kvx__ */
+
+#if defined __ELF__ && defined __linux__
+ .section .note.GNU-stack,"",%progbits
+#endif
+
diff --git a/libffi/src/m32r/ffi.c b/libffi/src/m32r/ffi.c
index 3000063..ab8fc4e 100644
--- a/libffi/src/m32r/ffi.c
+++ b/libffi/src/m32r/ffi.c
@@ -61,7 +61,7 @@ void ffi_prep_args(char *stack, extended_cif *ecif)
/* Align if necessary. */
if (((*p_arg)->alignment - 1) & (unsigned) argp)
- argp = (char *) ALIGN (argp, (*p_arg)->alignment);
+ argp = (char *) FFI_ALIGN (argp, (*p_arg)->alignment);
if (avn != 0)
{
diff --git a/libffi/src/m68k/ffi.c b/libffi/src/m68k/ffi.c
index 0dee938..0330184 100644
--- a/libffi/src/m68k/ffi.c
+++ b/libffi/src/m68k/ffi.c
@@ -105,7 +105,7 @@ ffi_prep_args (void *stack, extended_cif *ecif)
/* Align if necessary. */
if ((sizeof(int) - 1) & z)
- z = ALIGN(z, sizeof(int));
+ z = FFI_ALIGN(z, sizeof(int));
}
p_argv++;
@@ -297,7 +297,7 @@ ffi_prep_incoming_args_SYSV (char *stack, void **avalue, ffi_cif *cif)
/* Align if necessary */
if ((sizeof(int) - 1) & z)
- z = ALIGN(z, sizeof(int));
+ z = FFI_ALIGN(z, sizeof(int));
}
p_argv++;
diff --git a/libffi/src/m68k/sysv.S b/libffi/src/m68k/sysv.S
index ec2b14f..ea40f11 100644
--- a/libffi/src/m68k/sysv.S
+++ b/libffi/src/m68k/sysv.S
@@ -3,7 +3,7 @@
sysv.S - Copyright (c) 2012 Alan Hourihane
Copyright (c) 1998, 2012 Andreas Schwab
Copyright (c) 2008 Red Hat, Inc.
- Copyright (c) 2012 Thorsten Glaser
+ Copyright (c) 2012, 2016 Thorsten Glaser
m68k Foreign Function Interface
@@ -72,6 +72,15 @@ CALLFUNC(ffi_call_SYSV):
pea 4(%sp)
#if !defined __PIC__
jsr CALLFUNC(ffi_prep_args)
+#elif defined(__uClinux__) && defined(__ID_SHARED_LIBRARY__)
+ move.l _current_shared_library_a5_offset_(%a5),%a0
+ move.l CALLFUNC(ffi_prep_args@GOT)(%a0),%a0
+ jsr (%a0)
+#elif defined(__mcoldfire__) && !defined(__mcfisab__) && !defined(__mcfisac__)
+ move.l #_GLOBAL_OFFSET_TABLE_@GOTPC,%a0
+ lea (-6,%pc,%a0),%a0
+ move.l CALLFUNC(ffi_prep_args@GOT)(%a0),%a0
+ jsr (%a0)
#else
bsr.l CALLFUNC(ffi_prep_args@PLTPC)
#endif
@@ -215,6 +224,15 @@ CALLFUNC(ffi_closure_SYSV):
move.l %a0,-(%sp)
#if !defined __PIC__
jsr CALLFUNC(ffi_closure_SYSV_inner)
+#elif defined(__uClinux__) && defined(__ID_SHARED_LIBRARY__)
+ move.l _current_shared_library_a5_offset_(%a5),%a0
+ move.l CALLFUNC(ffi_closure_SYSV_inner@GOT)(%a0),%a0
+ jsr (%a0)
+#elif defined(__mcoldfire__) && !defined(__mcfisab__) && !defined(__mcfisac__)
+ move.l #_GLOBAL_OFFSET_TABLE_@GOTPC,%a0
+ lea (-6,%pc,%a0),%a0
+ move.l CALLFUNC(ffi_closure_SYSV_inner@GOT)(%a0),%a0
+ jsr (%a0)
#else
bsr.l CALLFUNC(ffi_closure_SYSV_inner@PLTPC)
#endif
@@ -317,6 +335,15 @@ CALLFUNC(ffi_closure_struct_SYSV):
move.l %a0,-(%sp)
#if !defined __PIC__
jsr CALLFUNC(ffi_closure_SYSV_inner)
+#elif defined(__uClinux__) && defined(__ID_SHARED_LIBRARY__)
+ move.l _current_shared_library_a5_offset_(%a5),%a0
+ move.l CALLFUNC(ffi_closure_SYSV_inner@GOT)(%a0),%a0
+ jsr (%a0)
+#elif defined(__mcoldfire__) && !defined(__mcfisab__) && !defined(__mcfisac__)
+ move.l #_GLOBAL_OFFSET_TABLE_@GOTPC,%a0
+ lea (-6,%pc,%a0),%a0
+ move.l CALLFUNC(ffi_closure_SYSV_inner@GOT)(%a0),%a0
+ jsr (%a0)
#else
bsr.l CALLFUNC(ffi_closure_SYSV_inner@PLTPC)
#endif
diff --git a/libffi/src/m88k/ffi.c b/libffi/src/m88k/ffi.c
index 68df494..57b344f 100644
--- a/libffi/src/m88k/ffi.c
+++ b/libffi/src/m88k/ffi.c
@@ -134,7 +134,7 @@ ffi_prep_args (void *stack, extended_cif *ecif)
/* Enforce proper stack alignment of 64-bit types */
if (argp == stackp && a > sizeof (int))
{
- stackp = (char *) ALIGN(stackp, a);
+ stackp = (char *) FFI_ALIGN(stackp, a);
argp = stackp;
}
@@ -177,7 +177,7 @@ ffi_prep_args (void *stack, extended_cif *ecif)
/* Align if necessary. */
if ((sizeof (int) - 1) & z)
- z = ALIGN(z, sizeof (int));
+ z = FFI_ALIGN(z, sizeof (int));
p_argv++;
@@ -320,7 +320,7 @@ ffi_prep_closure_args_OBSD (ffi_cif *cif, void **avalue, unsigned int *regp,
/* Enforce proper stack alignment of 64-bit types */
if (argp == stackp && a > sizeof (int))
{
- stackp = (char *) ALIGN(stackp, a);
+ stackp = (char *) FFI_ALIGN(stackp, a);
argp = stackp;
}
@@ -331,7 +331,7 @@ ffi_prep_closure_args_OBSD (ffi_cif *cif, void **avalue, unsigned int *regp,
/* Align if necessary */
if ((sizeof (int) - 1) & z)
- z = ALIGN(z, sizeof (int));
+ z = FFI_ALIGN(z, sizeof (int));
p_argv++;
diff --git a/libffi/src/metag/ffi.c b/libffi/src/metag/ffi.c
index 46b383e..3aecb0b 100644
--- a/libffi/src/metag/ffi.c
+++ b/libffi/src/metag/ffi.c
@@ -61,7 +61,7 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
argp -= z;
/* Align if necessary */
- argp = (char *) ALIGN_DOWN(ALIGN_DOWN(argp, (*p_arg)->alignment), 4);
+ argp = (char *) FFI_ALIGN_DOWN(FFI_ALIGN_DOWN(argp, (*p_arg)->alignment), 4);
if (z < sizeof(int)) {
z = sizeof(int);
@@ -93,7 +93,7 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
/* return the size of the arguments to be passed in registers,
padded to an 8 byte boundary to preserve stack alignment */
- return ALIGN(MIN(stack - argp, 6*4), 8);
+ return FFI_ALIGN(MIN(stack - argp, 6*4), 8);
}
/* Perform machine dependent cif processing */
@@ -112,20 +112,20 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
/* Add any padding if necessary */
if (((*ptr)->alignment - 1) & bytes)
- bytes = ALIGN(bytes, (*ptr)->alignment);
+ bytes = FFI_ALIGN(bytes, (*ptr)->alignment);
- bytes += ALIGN((*ptr)->size, 4);
+ bytes += FFI_ALIGN((*ptr)->size, 4);
}
/* Ensure arg space is aligned to an 8-byte boundary */
- bytes = ALIGN(bytes, 8);
+ bytes = FFI_ALIGN(bytes, 8);
/* Make space for the return structure pointer */
if (cif->rtype->type == FFI_TYPE_STRUCT) {
bytes += sizeof(void*);
/* Ensure stack is aligned to an 8-byte boundary */
- bytes = ALIGN(bytes, 8);
+ bytes = FFI_ALIGN(bytes, 8);
}
cif->bytes = bytes;
@@ -319,7 +319,7 @@ static void ffi_prep_incoming_args_SYSV(char *stack, void **rvalue,
if (alignment < 4)
alignment = 4;
if ((alignment - 1) & (unsigned)argp)
- argp = (char *) ALIGN(argp, alignment);
+ argp = (char *) FFI_ALIGN(argp, alignment);
z = (*p_arg)->size;
*p_argv = (void*) argp;
diff --git a/libffi/src/microblaze/ffi.c b/libffi/src/microblaze/ffi.c
index ea962ea..df6e33c 100644
--- a/libffi/src/microblaze/ffi.c
+++ b/libffi/src/microblaze/ffi.c
@@ -35,7 +35,7 @@ extern void ffi_closure_SYSV(void);
#define WORD_SIZE sizeof(unsigned int)
#define ARGS_REGISTER_SIZE (WORD_SIZE * 6)
-#define WORD_ALIGN(x) ALIGN(x, WORD_SIZE)
+#define WORD_FFI_ALIGN(x) FFI_ALIGN(x, WORD_SIZE)
/* ffi_prep_args is called by the assembly routine once stack space
has been allocated for the function's arguments */
@@ -46,12 +46,12 @@ void ffi_prep_args(void* stack, extended_cif* ecif)
void** p_argv;
void* stack_args_p = stack;
- p_argv = ecif->avalue;
-
if (ecif == NULL || ecif->cif == NULL) {
return; /* no description to prepare */
}
+ p_argv = ecif->avalue;
+
if ((ecif->cif->rtype != NULL) &&
(ecif->cif->rtype->type == FFI_TYPE_STRUCT))
{
@@ -74,7 +74,7 @@ void ffi_prep_args(void* stack, extended_cif* ecif)
int type = (*p_arg)->type;
void* value = p_argv[i];
char* addr = stack_args_p;
- int aligned_size = WORD_ALIGN(size);
+ int aligned_size = WORD_FFI_ALIGN(size);
/* force word alignment on the stack */
stack_args_p += aligned_size;
@@ -259,7 +259,7 @@ void ffi_closure_call_SYSV(void* register_args, void* stack_args,
avalue[i] = ptr;
break;
}
- ptr += WORD_ALIGN(arg_types[i]->size);
+ ptr += WORD_FFI_ALIGN(arg_types[i]->size);
}
/* set the return type info passed back to the wrapper */
diff --git a/libffi/src/mips/ffi.c b/libffi/src/mips/ffi.c
index ecd783a..979ca49 100644
--- a/libffi/src/mips/ffi.c
+++ b/libffi/src/mips/ffi.c
@@ -29,6 +29,7 @@
#include <ffi.h>
#include <ffi_common.h>
+#include <stdint.h>
#include <stdlib.h>
#ifdef __GNUC__
@@ -38,7 +39,9 @@
#endif
#ifndef USE__BUILTIN___CLEAR_CACHE
-# if defined(__OpenBSD__)
+# if defined(__FreeBSD__)
+# include <machine/sysarch.h>
+# elif defined(__OpenBSD__)
# include <mips64/sysarch.h>
# else
# include <sys/cachectl.h>
@@ -116,7 +119,7 @@ static void ffi_prep_args(char *stack,
if ((a - 1) & (unsigned long) argp)
{
- argp = (char *) ALIGN(argp, a);
+ argp = (char *) FFI_ALIGN(argp, a);
FIX_ARGP;
}
@@ -247,7 +250,7 @@ calc_n32_struct_flags(int soft_float, ffi_type *arg,
while ((e = arg->elements[index]))
{
/* Align this object. */
- *loc = ALIGN(*loc, e->alignment);
+ *loc = FFI_ALIGN(*loc, e->alignment);
if (e->type == FFI_TYPE_DOUBLE)
{
/* Already aligned to FFI_SIZEOF_ARG. */
@@ -262,7 +265,7 @@ calc_n32_struct_flags(int soft_float, ffi_type *arg,
index++;
}
/* Next Argument register at alignment of FFI_SIZEOF_ARG. */
- *arg_reg = ALIGN(*loc, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ *arg_reg = FFI_ALIGN(*loc, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
return flags;
}
@@ -322,9 +325,10 @@ calc_n32_return_struct_flags(int soft_float, ffi_type *arg)
#endif
/* Perform machine dependent cif processing */
-ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+static ffi_status ffi_prep_cif_machdep_int(ffi_cif *cif, unsigned nfixedargs)
{
cif->flags = 0;
+ cif->mips_nfixedargs = nfixedargs;
#ifdef FFI_MIPS_O32
/* Set the flags necessary for O32 processing. FFI_O32_SOFT_FLOAT
@@ -333,7 +337,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
if (cif->rtype->type != FFI_TYPE_STRUCT && cif->abi == FFI_O32)
{
- if (cif->nargs > 0)
+ if (cif->nargs > 0 && cif->nargs == nfixedargs)
{
switch ((cif->arg_types)[0]->type)
{
@@ -450,7 +454,9 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
while (count-- > 0 && arg_reg < 8)
{
type = (cif->arg_types)[index]->type;
- if (soft_float)
+
+ // Pass variadic arguments in integer registers even if they're floats
+ if (soft_float || index >= nfixedargs)
{
switch (type)
{
@@ -474,9 +480,9 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
break;
case FFI_TYPE_LONGDOUBLE:
/* Align it. */
- arg_reg = ALIGN(arg_reg, 2);
+ arg_reg = FFI_ALIGN(arg_reg, 2);
/* Treat it as two adjacent doubles. */
- if (soft_float)
+ if (soft_float || index >= nfixedargs)
{
arg_reg += 2;
}
@@ -493,7 +499,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
case FFI_TYPE_STRUCT:
loc = arg_reg * FFI_SIZEOF_ARG;
- cif->flags += calc_n32_struct_flags(soft_float,
+ cif->flags += calc_n32_struct_flags(soft_float || index >= nfixedargs,
(cif->arg_types)[index],
&loc, &arg_reg);
break;
@@ -578,17 +584,30 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
return FFI_OK;
}
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+{
+ return ffi_prep_cif_machdep_int(cif, cif->nargs);
+}
+
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
+ unsigned nfixedargs,
+ unsigned ntotalargs MAYBE_UNUSED)
+{
+ return ffi_prep_cif_machdep_int(cif, nfixedargs);
+}
+
/* Low level routine for calling O32 functions */
extern int ffi_call_O32(void (*)(char *, extended_cif *, int, int),
extended_cif *, unsigned,
- unsigned, unsigned *, void (*)(void));
+ unsigned, unsigned *, void (*)(void), void *closure);
/* Low level routine for calling N32 functions */
extern int ffi_call_N32(void (*)(char *, extended_cif *, int, int),
extended_cif *, unsigned,
- unsigned, void *, void (*)(void));
+ unsigned, void *, void (*)(void), void *closure);
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+void ffi_call_int(ffi_cif *cif, void (*fn)(void), void *rvalue,
+ void **avalue, void *closure)
{
extended_cif ecif;
@@ -610,7 +629,7 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
case FFI_O32:
case FFI_O32_SOFT_FLOAT:
ffi_call_O32(ffi_prep_args, &ecif, cif->bytes,
- cif->flags, ecif.rvalue, fn);
+ cif->flags, ecif.rvalue, fn, closure);
break;
#endif
@@ -642,7 +661,7 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
#endif
}
ffi_call_N32(ffi_prep_args, &ecif, cif->bytes,
- cif->flags, rvalue_copy, fn);
+ cif->flags, rvalue_copy, fn, closure);
if (copy_rvalue)
memcpy(ecif.rvalue, rvalue_copy + copy_offset, cif->rtype->size);
}
@@ -655,11 +674,27 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
}
}
+void
+ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+ ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+ void **avalue, void *closure)
+{
+ ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+
+
#if FFI_CLOSURES
#if defined(FFI_MIPS_O32)
extern void ffi_closure_O32(void);
+extern void ffi_go_closure_O32(void);
#else
extern void ffi_closure_N32(void);
+extern void ffi_go_closure_N32(void);
#endif /* FFI_MIPS_O32 */
ffi_status
@@ -744,11 +779,13 @@ ffi_prep_closure_loc (ffi_closure *closure,
closure->fun = fun;
closure->user_data = user_data;
+#if !defined(__FreeBSD__)
#ifdef USE__BUILTIN___CLEAR_CACHE
__builtin___clear_cache(clear_location, clear_location + FFI_TRAMPOLINE_SIZE);
#else
cacheflush (clear_location, FFI_TRAMPOLINE_SIZE, ICACHE);
#endif
+#endif /* ! __FreeBSD__ */
return FFI_OK;
}
@@ -770,27 +807,28 @@ ffi_prep_closure_loc (ffi_closure *closure,
* Based on the similar routine for sparc.
*/
int
-ffi_closure_mips_inner_O32 (ffi_closure *closure,
+ffi_closure_mips_inner_O32 (ffi_cif *cif,
+ void (*fun)(ffi_cif*, void*, void**, void*),
+ void *user_data,
void *rvalue, ffi_arg *ar,
double *fpr)
{
- ffi_cif *cif;
void **avaluep;
ffi_arg *avalue;
ffi_type **arg_types;
int i, avn, argn, seen_int;
- cif = closure->cif;
avalue = alloca (cif->nargs * sizeof (ffi_arg));
avaluep = alloca (cif->nargs * sizeof (ffi_arg));
- seen_int = (cif->abi == FFI_O32_SOFT_FLOAT);
+ seen_int = (cif->abi == FFI_O32_SOFT_FLOAT) || (cif->mips_nfixedargs != cif->nargs);
argn = 0;
if ((cif->flags >> (FFI_FLAG_BITS * 2)) == FFI_TYPE_STRUCT)
{
- rvalue = (void *)(UINT32)ar[0];
+ rvalue = (void *)(uintptr_t)ar[0];
argn = 1;
+ seen_int = 1;
}
i = 0;
@@ -799,6 +837,8 @@ ffi_closure_mips_inner_O32 (ffi_closure *closure,
while (i < avn)
{
+ if (arg_types[i]->alignment == 8 && (argn & 0x1))
+ argn++;
if (i < 2 && !seen_int &&
(arg_types[i]->type == FFI_TYPE_FLOAT ||
arg_types[i]->type == FFI_TYPE_DOUBLE ||
@@ -813,8 +853,6 @@ ffi_closure_mips_inner_O32 (ffi_closure *closure,
}
else
{
- if (arg_types[i]->alignment == 8 && (argn & 0x1))
- argn++;
switch (arg_types[i]->type)
{
case FFI_TYPE_SINT8:
@@ -843,12 +881,12 @@ ffi_closure_mips_inner_O32 (ffi_closure *closure,
}
seen_int = 1;
}
- argn += ALIGN(arg_types[i]->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ argn += FFI_ALIGN(arg_types[i]->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
i++;
}
/* Invoke the closure. */
- (closure->fun) (cif, rvalue, avaluep, closure->user_data);
+ fun(cif, rvalue, avaluep, user_data);
if (cif->abi == FFI_O32_SOFT_FLOAT)
{
@@ -884,7 +922,7 @@ copy_struct_N32(char *target, unsigned offset, ffi_abi abi, ffi_type *type,
char *argp;
char *fpp;
- o = ALIGN(offset, elt_type->alignment);
+ o = FFI_ALIGN(offset, elt_type->alignment);
arg_offset += o - offset;
offset = o;
argn += arg_offset / sizeof(ffi_arg);
@@ -924,11 +962,12 @@ copy_struct_N32(char *target, unsigned offset, ffi_abi abi, ffi_type *type,
*
*/
int
-ffi_closure_mips_inner_N32 (ffi_closure *closure,
+ffi_closure_mips_inner_N32 (ffi_cif *cif,
+ void (*fun)(ffi_cif*, void*, void**, void*),
+ void *user_data,
void *rvalue, ffi_arg *ar,
ffi_arg *fpr)
{
- ffi_cif *cif;
void **avaluep;
ffi_arg *avalue;
ffi_type **arg_types;
@@ -936,7 +975,6 @@ ffi_closure_mips_inner_N32 (ffi_closure *closure,
int soft_float;
ffi_arg *argp;
- cif = closure->cif;
soft_float = cif->abi == FFI_N64_SOFT_FLOAT
|| cif->abi == FFI_N32_SOFT_FLOAT;
avalue = alloca (cif->nargs * sizeof (ffi_arg));
@@ -964,10 +1002,10 @@ ffi_closure_mips_inner_N32 (ffi_closure *closure,
|| arg_types[i]->type == FFI_TYPE_DOUBLE
|| arg_types[i]->type == FFI_TYPE_LONGDOUBLE)
{
- argp = (argn >= 8 || soft_float) ? ar + argn : fpr + argn;
- if ((arg_types[i]->type == FFI_TYPE_LONGDOUBLE) && ((unsigned)argp & (arg_types[i]->alignment-1)))
+ argp = (argn >= 8 || i >= cif->mips_nfixedargs || soft_float) ? ar + argn : fpr + argn;
+ if ((arg_types[i]->type == FFI_TYPE_LONGDOUBLE) && ((uintptr_t)argp & (arg_types[i]->alignment-1)))
{
- argp=(ffi_arg*)ALIGN(argp,arg_types[i]->alignment);
+ argp=(ffi_arg*)FFI_ALIGN(argp,arg_types[i]->alignment);
argn++;
}
#if defined(__MIPSEB__) || defined(_MIPSEB)
@@ -982,7 +1020,7 @@ ffi_closure_mips_inner_N32 (ffi_closure *closure,
unsigned type = arg_types[i]->type;
if (arg_types[i]->alignment > sizeof(ffi_arg))
- argn = ALIGN(argn, arg_types[i]->alignment / sizeof(ffi_arg));
+ argn = FFI_ALIGN(argn, arg_types[i]->alignment / sizeof(ffi_arg));
argp = ar + argn;
@@ -1033,7 +1071,7 @@ ffi_closure_mips_inner_N32 (ffi_closure *closure,
it was passed in registers. */
avaluep[i] = alloca(arg_types[i]->size);
copy_struct_N32(avaluep[i], 0, cif->abi, arg_types[i],
- argn, 0, ar, fpr, soft_float);
+ argn, 0, ar, fpr, i >= cif->mips_nfixedargs || soft_float);
break;
}
@@ -1043,16 +1081,54 @@ ffi_closure_mips_inner_N32 (ffi_closure *closure,
break;
}
}
- argn += ALIGN(arg_types[i]->size, sizeof(ffi_arg)) / sizeof(ffi_arg);
+ argn += FFI_ALIGN(arg_types[i]->size, sizeof(ffi_arg)) / sizeof(ffi_arg);
i++;
}
/* Invoke the closure. */
- (closure->fun) (cif, rvalue, avaluep, closure->user_data);
+ fun (cif, rvalue, avaluep, user_data);
return cif->flags >> (FFI_FLAG_BITS * 8);
}
#endif /* FFI_MIPS_N32 */
+#if defined(FFI_MIPS_O32)
+extern void ffi_closure_O32(void);
+extern void ffi_go_closure_O32(void);
+#else
+extern void ffi_closure_N32(void);
+extern void ffi_go_closure_N32(void);
+#endif /* FFI_MIPS_O32 */
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+ void (*fun)(ffi_cif*,void*,void**,void*))
+{
+ void * fn;
+
+#if defined(FFI_MIPS_O32)
+ if (cif->abi != FFI_O32 && cif->abi != FFI_O32_SOFT_FLOAT)
+ return FFI_BAD_ABI;
+ fn = ffi_go_closure_O32;
+#else
+#if _MIPS_SIM ==_ABIN32
+ if (cif->abi != FFI_N32
+ && cif->abi != FFI_N32_SOFT_FLOAT)
+ return FFI_BAD_ABI;
+#else
+ if (cif->abi != FFI_N64
+ && cif->abi != FFI_N64_SOFT_FLOAT)
+ return FFI_BAD_ABI;
+#endif
+ fn = ffi_go_closure_N32;
+#endif /* FFI_MIPS_O32 */
+
+ closure->tramp = (void *)fn;
+ closure->cif = cif;
+ closure->fun = fun;
+
+ return FFI_OK;
+}
+
#endif /* FFI_CLOSURES */
diff --git a/libffi/src/mips/ffitarget.h b/libffi/src/mips/ffitarget.h
index 717d659..fdd5ca9 100644
--- a/libffi/src/mips/ffitarget.h
+++ b/libffi/src/mips/ffitarget.h
@@ -32,7 +32,7 @@
#error "Please do not include ffitarget.h directly into your source. Use ffi.h instead."
#endif
-#ifdef linux
+#ifdef __linux__
# include <asm/sgidefs.h>
#elif defined(__rtems__)
/*
@@ -41,7 +41,7 @@
#define _MIPS_SIM_ABI32 1
#define _MIPS_SIM_NABI32 2
#define _MIPS_SIM_ABI64 3
-#elif !defined(__OpenBSD__)
+#elif !defined(__OpenBSD__) && !defined(__FreeBSD__)
# include <sgidefs.h>
#endif
@@ -224,24 +224,21 @@ typedef enum ffi_abi {
#endif
} ffi_abi;
-#define FFI_EXTRA_CIF_FIELDS unsigned rstruct_flag
+#define FFI_EXTRA_CIF_FIELDS unsigned rstruct_flag; unsigned mips_nfixedargs
+#define FFI_TARGET_SPECIFIC_VARIADIC
#endif /* !LIBFFI_ASM */
/* ---- Definitions for closures ----------------------------------------- */
-#if defined(FFI_MIPS_O32)
#define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 20
-#else
-/* N32/N64. */
-# define FFI_CLOSURES 1
-#if _MIPS_SIM==_ABI64
-#define FFI_TRAMPOLINE_SIZE 52
+#define FFI_GO_CLOSURES 1
+#define FFI_NATIVE_RAW_API 0
+
+#if defined(FFI_MIPS_O32) || (_MIPS_SIM ==_ABIN32)
+# define FFI_TRAMPOLINE_SIZE 20
#else
-#define FFI_TRAMPOLINE_SIZE 20
+# define FFI_TRAMPOLINE_SIZE 56
#endif
-#endif /* FFI_MIPS_O32 */
-#define FFI_NATIVE_RAW_API 0
#endif
diff --git a/libffi/src/mips/n32.S b/libffi/src/mips/n32.S
index 06e6c46..23b77fd 100644
--- a/libffi/src/mips/n32.S
+++ b/libffi/src/mips/n32.S
@@ -37,8 +37,12 @@
#define flags a3
#define raddr a4
#define fn a5
+#define closure a6
-#define SIZEOF_FRAME ( 8 * FFI_SIZEOF_ARG )
+/* Note: to keep stack 16 byte aligned we need even number slots
+ used 9 slots here
+*/
+#define SIZEOF_FRAME ( 10 * FFI_SIZEOF_ARG )
#ifdef __GNUC__
.abicalls
@@ -51,24 +55,25 @@
.globl ffi_call_N32
.ent ffi_call_N32
ffi_call_N32:
-.LFB3:
+.LFB0:
.frame $fp, SIZEOF_FRAME, ra
.mask 0xc0000000,-FFI_SIZEOF_ARG
.fmask 0x00000000,0
# Prologue
SUBU $sp, SIZEOF_FRAME # Frame size
-.LCFI0:
+.LCFI00:
REG_S $fp, SIZEOF_FRAME - 2*FFI_SIZEOF_ARG($sp) # Save frame pointer
REG_S ra, SIZEOF_FRAME - 1*FFI_SIZEOF_ARG($sp) # Save return address
-.LCFI1:
+.LCFI01:
move $fp, $sp
-.LCFI3:
+.LCFI02:
move t9, callback # callback function pointer
REG_S bytes, 2*FFI_SIZEOF_ARG($fp) # bytes
REG_S flags, 3*FFI_SIZEOF_ARG($fp) # flags
REG_S raddr, 4*FFI_SIZEOF_ARG($fp) # raddr
REG_S fn, 5*FFI_SIZEOF_ARG($fp) # fn
+ REG_S closure, 6*FFI_SIZEOF_ARG($fp) # closure
# Allocate at least 4 words in the argstack
move v0, bytes
@@ -109,6 +114,16 @@ loadregs:
REG_L t6, 3*FFI_SIZEOF_ARG($fp) # load the flags word into t6.
+#ifdef __mips_soft_float
+ REG_L a0, 0*FFI_SIZEOF_ARG(t9)
+ REG_L a1, 1*FFI_SIZEOF_ARG(t9)
+ REG_L a2, 2*FFI_SIZEOF_ARG(t9)
+ REG_L a3, 3*FFI_SIZEOF_ARG(t9)
+ REG_L a4, 4*FFI_SIZEOF_ARG(t9)
+ REG_L a5, 5*FFI_SIZEOF_ARG(t9)
+ REG_L a6, 6*FFI_SIZEOF_ARG(t9)
+ REG_L a7, 7*FFI_SIZEOF_ARG(t9)
+#else
and t4, t6, ((1<<FFI_FLAG_BITS)-1)
REG_L a0, 0*FFI_SIZEOF_ARG(t9)
beqz t4, arg1_next
@@ -195,11 +210,15 @@ arg7_next:
arg8_doublep:
l.d $f19, 7*FFI_SIZEOF_ARG(t9)
arg8_next:
+#endif
callit:
# Load the function pointer
REG_L t9, 5*FFI_SIZEOF_ARG($fp)
+ # install the static chain(t7=$15)
+ REG_L t7, 6*FFI_SIZEOF_ARG($fp)
+
# If the return value pointer is NULL, assume no return value.
REG_L t5, 4*FFI_SIZEOF_ARG($fp)
beqz t5, noretval
@@ -216,6 +235,7 @@ retint:
b epilogue
retfloat:
+#ifndef __mips_soft_float
bne t6, FFI_TYPE_FLOAT, retdouble
jal t9
REG_L t4, 4*FFI_SIZEOF_ARG($fp)
@@ -274,6 +294,7 @@ retstruct_f_d:
s.s $f0, 0(t4)
s.d $f2, 8(t4)
b epilogue
+#endif
retstruct_d_soft:
bne t6, FFI_TYPE_STRUCT_D_SOFT, retstruct_f_soft
@@ -348,7 +369,7 @@ epilogue:
ADDU $sp, SIZEOF_FRAME # Fix stack pointer
j ra
-.LFE3:
+.LFE0:
.end ffi_call_N32
/* ffi_closure_N32. Expects address of the passed-in ffi_closure in t0
@@ -408,6 +429,41 @@ epilogue:
#define GP_OFF2 (0 * FFI_SIZEOF_ARG)
.align 2
+ .globl ffi_go_closure_N32
+ .ent ffi_go_closure_N32
+ffi_go_closure_N32:
+.LFB1:
+ .frame $sp, SIZEOF_FRAME2, ra
+ .mask 0x90000000,-(SIZEOF_FRAME2 - RA_OFF2)
+ .fmask 0x00000000,0
+ SUBU $sp, SIZEOF_FRAME2
+.LCFI10:
+ .cpsetup t9, GP_OFF2, ffi_go_closure_N32
+ REG_S ra, RA_OFF2($sp) # Save return address
+.LCFI11:
+
+ REG_S a0, A0_OFF2($sp)
+ REG_S a1, A1_OFF2($sp)
+ REG_S a2, A2_OFF2($sp)
+ REG_S a3, A3_OFF2($sp)
+ REG_S a4, A4_OFF2($sp)
+ REG_S a5, A5_OFF2($sp)
+
+ # Call ffi_closure_mips_inner_N32 to do the real work.
+ LA t9, ffi_closure_mips_inner_N32
+ REG_L a0, 8($15) # cif
+ REG_L a1, 16($15) # fun
+ move a2, t7 # userdata=closure
+ ADDU a3, $sp, V0_OFF2 # rvalue
+ ADDU a4, $sp, A0_OFF2 # ar
+ ADDU a5, $sp, F12_OFF2 # fpr
+
+ b $do_closure
+
+.LFE1:
+ .end ffi_go_closure_N32
+
+ .align 2
.globl ffi_closure_N32
.ent ffi_closure_N32
ffi_closure_N32:
@@ -416,21 +472,33 @@ ffi_closure_N32:
.mask 0x90000000,-(SIZEOF_FRAME2 - RA_OFF2)
.fmask 0x00000000,0
SUBU $sp, SIZEOF_FRAME2
-.LCFI5:
+.LCFI20:
.cpsetup t9, GP_OFF2, ffi_closure_N32
REG_S ra, RA_OFF2($sp) # Save return address
-.LCFI6:
- # Store all possible argument registers. If there are more than
- # fit in registers, then they were stored on the stack.
+.LCFI21:
REG_S a0, A0_OFF2($sp)
REG_S a1, A1_OFF2($sp)
REG_S a2, A2_OFF2($sp)
REG_S a3, A3_OFF2($sp)
REG_S a4, A4_OFF2($sp)
REG_S a5, A5_OFF2($sp)
+
+ # Call ffi_closure_mips_inner_N32 to do the real work.
+ LA t9, ffi_closure_mips_inner_N32
+ REG_L a0, 56($12) # cif
+ REG_L a1, 64($12) # fun
+ REG_L a2, 72($12) # user_data
+ ADDU a3, $sp, V0_OFF2
+ ADDU a4, $sp, A0_OFF2
+ ADDU a5, $sp, F12_OFF2
+
+$do_closure:
+ # Store all possible argument registers. If there are more than
+ # fit in registers, then they were stored on the stack.
REG_S a6, A6_OFF2($sp)
REG_S a7, A7_OFF2($sp)
+#ifndef __mips_soft_float
# Store all possible float/double registers.
s.d $f12, F12_OFF2($sp)
s.d $f13, F13_OFF2($sp)
@@ -440,13 +508,8 @@ ffi_closure_N32:
s.d $f17, F17_OFF2($sp)
s.d $f18, F18_OFF2($sp)
s.d $f19, F19_OFF2($sp)
+#endif
- # Call ffi_closure_mips_inner_N32 to do the real work.
- LA t9, ffi_closure_mips_inner_N32
- move a0, $12 # Pointer to the ffi_closure
- ADDU a1, $sp, V0_OFF2
- ADDU a2, $sp, A0_OFF2
- ADDU a3, $sp, F12_OFF2
jalr t9
# Return flags are in v0
@@ -460,6 +523,7 @@ cls_retint:
b cls_epilogue
cls_retfloat:
+#ifndef __mips_soft_float
bne v0, FFI_TYPE_FLOAT, cls_retdouble
l.s $f0, V0_OFF2($sp)
b cls_epilogue
@@ -502,6 +566,7 @@ cls_retstruct_f_d:
l.s $f0, V0_OFF2($sp)
l.d $f2, V1_OFF2($sp)
b cls_epilogue
+#endif
cls_retstruct_small2:
REG_L v0, V0_OFF2($sp)
@@ -517,7 +582,7 @@ cls_epilogue:
.end ffi_closure_N32
#ifdef __GNUC__
- .section .eh_frame,"aw",@progbits
+ .section .eh_frame,EH_FRAME_FLAGS,@progbits
.Lframe1:
.4byte .LECIE1-.LSCIE1 # length
.LSCIE1:
@@ -533,46 +598,66 @@ cls_epilogue:
.align EH_FRAME_ALIGN
.LECIE1:
-.LSFDE1:
- .4byte .LEFDE1-.LASFDE1 # length.
-.LASFDE1:
- .4byte .LASFDE1-.Lframe1 # CIE_pointer.
- FDE_ADDR_BYTES .LFB3 # initial_location.
- FDE_ADDR_BYTES .LFE3-.LFB3 # address_range.
+.LSFDE0:
+ .4byte .LEFDE0-.LASFDE0 # length.
+.LASFDE0:
+ .4byte .LASFDE0-.Lframe1 # CIE_pointer.
+ FDE_ADDR_BYTES .LFB0 # initial_location.
+ FDE_ADDR_BYTES .LFE0-.LFB0 # address_range.
.byte 0x4 # DW_CFA_advance_loc4
- .4byte .LCFI0-.LFB3 # to .LCFI0
+ .4byte .LCFI00-.LFB0 # to .LCFI00
.byte 0xe # DW_CFA_def_cfa_offset
.uleb128 SIZEOF_FRAME # adjust stack.by SIZEOF_FRAME
.byte 0x4 # DW_CFA_advance_loc4
- .4byte .LCFI1-.LCFI0 # to .LCFI1
+ .4byte .LCFI01-.LCFI00 # to .LCFI01
.byte 0x9e # DW_CFA_offset of $fp
.uleb128 2*FFI_SIZEOF_ARG/4 #
.byte 0x9f # DW_CFA_offset of ra
.uleb128 1*FFI_SIZEOF_ARG/4 #
.byte 0x4 # DW_CFA_advance_loc4
- .4byte .LCFI3-.LCFI1 # to .LCFI3
+ .4byte .LCFI02-.LCFI01 # to .LCFI02
.byte 0xd # DW_CFA_def_cfa_register
.uleb128 0x1e # in $fp
.align EH_FRAME_ALIGN
+.LEFDE0:
+
+.LSFDE1:
+ .4byte .LEFDE1-.LASFDE1 # length
+.LASFDE1:
+ .4byte .LASFDE1-.Lframe1 # CIE_pointer.
+ FDE_ADDR_BYTES .LFB1 # initial_location.
+ FDE_ADDR_BYTES .LFE1-.LFB1 # address_range.
+ .byte 0x4 # DW_CFA_advance_loc4
+ .4byte .LCFI10-.LFB1 # to .LCFI10
+ .byte 0xe # DW_CFA_def_cfa_offset
+ .uleb128 SIZEOF_FRAME2 # adjust stack.by SIZEOF_FRAME
+ .byte 0x4 # DW_CFA_advance_loc4
+ .4byte .LCFI11-.LCFI10 # to .LCFI11
+ .byte 0x9c # DW_CFA_offset of $gp ($28)
+ .uleb128 (SIZEOF_FRAME2 - GP_OFF2)/4
+ .byte 0x9f # DW_CFA_offset of ra ($31)
+ .uleb128 (SIZEOF_FRAME2 - RA_OFF2)/4
+ .align EH_FRAME_ALIGN
.LEFDE1:
-.LSFDE3:
- .4byte .LEFDE3-.LASFDE3 # length
-.LASFDE3:
- .4byte .LASFDE3-.Lframe1 # CIE_pointer.
+
+.LSFDE2:
+ .4byte .LEFDE2-.LASFDE2 # length
+.LASFDE2:
+ .4byte .LASFDE2-.Lframe1 # CIE_pointer.
FDE_ADDR_BYTES .LFB2 # initial_location.
FDE_ADDR_BYTES .LFE2-.LFB2 # address_range.
.byte 0x4 # DW_CFA_advance_loc4
- .4byte .LCFI5-.LFB2 # to .LCFI5
+ .4byte .LCFI20-.LFB2 # to .LCFI20
.byte 0xe # DW_CFA_def_cfa_offset
.uleb128 SIZEOF_FRAME2 # adjust stack.by SIZEOF_FRAME
.byte 0x4 # DW_CFA_advance_loc4
- .4byte .LCFI6-.LCFI5 # to .LCFI6
+ .4byte .LCFI21-.LCFI20 # to .LCFI21
.byte 0x9c # DW_CFA_offset of $gp ($28)
.uleb128 (SIZEOF_FRAME2 - GP_OFF2)/4
.byte 0x9f # DW_CFA_offset of ra ($31)
.uleb128 (SIZEOF_FRAME2 - RA_OFF2)/4
.align EH_FRAME_ALIGN
-.LEFDE3:
+.LEFDE2:
#endif /* __GNUC__ */
#endif
diff --git a/libffi/src/mips/o32.S b/libffi/src/mips/o32.S
index eb27981..799139b 100644
--- a/libffi/src/mips/o32.S
+++ b/libffi/src/mips/o32.S
@@ -50,14 +50,14 @@ ffi_call_O32:
$LFB0:
# Prologue
SUBU $sp, SIZEOF_FRAME # Frame size
-$LCFI0:
+$LCFI00:
REG_S $fp, FP_OFF($sp) # Save frame pointer
-$LCFI1:
+$LCFI01:
REG_S ra, RA_OFF($sp) # Save return address
-$LCFI2:
+$LCFI02:
move $fp, $sp
-$LCFI3:
+$LCFI03:
move t9, callback # callback function pointer
REG_S flags, A3_OFF($fp) # flags
@@ -82,13 +82,16 @@ sixteen:
ADDU $sp, 4 * FFI_SIZEOF_ARG # adjust $sp to new args
+#ifndef __mips_soft_float
bnez t0, pass_d # make it quick for int
+#endif
REG_L a0, 0*FFI_SIZEOF_ARG($sp) # just go ahead and load the
REG_L a1, 1*FFI_SIZEOF_ARG($sp) # four regs.
REG_L a2, 2*FFI_SIZEOF_ARG($sp)
REG_L a3, 3*FFI_SIZEOF_ARG($sp)
b call_it
+#ifndef __mips_soft_float
pass_d:
bne t0, FFI_ARGS_D, pass_f
l.d $f12, 0*FFI_SIZEOF_ARG($sp) # load $fp regs from args
@@ -130,8 +133,12 @@ pass_f_d:
# bne t0, FFI_ARGS_F_D, call_it
l.s $f12, 0*FFI_SIZEOF_ARG($sp) # load $fp regs from args
l.d $f14, 2*FFI_SIZEOF_ARG($sp) # passing double and float
+#endif
call_it:
+ # Load the static chain pointer
+ REG_L t7, SIZEOF_FRAME + 6*FFI_SIZEOF_ARG($fp)
+
# Load the function pointer
REG_L t9, SIZEOF_FRAME + 5*FFI_SIZEOF_ARG($fp)
@@ -158,14 +165,23 @@ retfloat:
bne t2, FFI_TYPE_FLOAT, retdouble
jalr t9
REG_L t0, SIZEOF_FRAME + 4*FFI_SIZEOF_ARG($fp)
+#ifndef __mips_soft_float
s.s $f0, 0(t0)
+#else
+ REG_S v0, 0(t0)
+#endif
b epilogue
retdouble:
bne t2, FFI_TYPE_DOUBLE, noretval
jalr t9
REG_L t0, SIZEOF_FRAME + 4*FFI_SIZEOF_ARG($fp)
+#ifndef __mips_soft_float
s.d $f0, 0(t0)
+#else
+ REG_S v1, 4(t0)
+ REG_S v0, 0(t0)
+#endif
b epilogue
noretval:
@@ -204,13 +220,15 @@ $LFE0:
-8 - f14 (le low, be high)
-9 - f12 (le high, be low)
-10 - f12 (le low, be high)
- -11 - Called function a3 save
- -12 - Called function a2 save
- -13 - Called function a1 save
- -14 - Called function a0 save, our sp and fp point here
+ -11 - Called function a5 save
+ -12 - Called function a4 save
+ -13 - Called function a3 save
+ -14 - Called function a2 save
+ -15 - Called function a1 save
+ -16 - Called function a0 save, our sp and fp point here
*/
-#define SIZEOF_FRAME2 (14 * FFI_SIZEOF_ARG)
+#define SIZEOF_FRAME2 (16 * FFI_SIZEOF_ARG)
#define A3_OFF2 (SIZEOF_FRAME2 + 3 * FFI_SIZEOF_ARG)
#define A2_OFF2 (SIZEOF_FRAME2 + 2 * FFI_SIZEOF_ARG)
#define A1_OFF2 (SIZEOF_FRAME2 + 1 * FFI_SIZEOF_ARG)
@@ -225,13 +243,73 @@ $LFE0:
#define FA_1_0_OFF2 (SIZEOF_FRAME2 - 8 * FFI_SIZEOF_ARG)
#define FA_0_1_OFF2 (SIZEOF_FRAME2 - 9 * FFI_SIZEOF_ARG)
#define FA_0_0_OFF2 (SIZEOF_FRAME2 - 10 * FFI_SIZEOF_ARG)
+#define CALLED_A5_OFF2 (SIZEOF_FRAME2 - 11 * FFI_SIZEOF_ARG)
+#define CALLED_A4_OFF2 (SIZEOF_FRAME2 - 12 * FFI_SIZEOF_ARG)
.text
+
+ .align 2
+ .globl ffi_go_closure_O32
+ .ent ffi_go_closure_O32
+ffi_go_closure_O32:
+$LFB1:
+ # Prologue
+ .frame $fp, SIZEOF_FRAME2, ra
+ .set noreorder
+ .cpload t9
+ .set reorder
+ SUBU $sp, SIZEOF_FRAME2
+ .cprestore GP_OFF2
+$LCFI10:
+
+ REG_S $16, S0_OFF2($sp) # Save s0
+ REG_S $fp, FP_OFF2($sp) # Save frame pointer
+ REG_S ra, RA_OFF2($sp) # Save return address
+$LCFI11:
+
+ move $fp, $sp
+$LCFI12:
+
+ REG_S a0, A0_OFF2($fp)
+ REG_S a1, A1_OFF2($fp)
+ REG_S a2, A2_OFF2($fp)
+ REG_S a3, A3_OFF2($fp)
+
+ # Load ABI enum to s0
+ REG_L $16, 4($15) # cif
+ REG_L $16, 0($16) # abi is first member.
+
+ li $13, 1 # FFI_O32
+ bne $16, $13, 1f # Skip fp save if FFI_O32_SOFT_FLOAT
+
+#ifndef __mips_soft_float
+ # Store all possible float/double registers.
+ s.d $f12, FA_0_0_OFF2($fp)
+ s.d $f14, FA_1_0_OFF2($fp)
+#endif
+1:
+ # prepare arguments for ffi_closure_mips_inner_O32
+ REG_L a0, 4($15) # cif
+ REG_L a1, 8($15) # fun
+ move a2, $15 # user_data = go closure
+ addu a3, $fp, V0_OFF2 # rvalue
+
+ addu t9, $fp, A0_OFF2 # ar
+ REG_S t9, CALLED_A4_OFF2($fp)
+
+ addu t9, $fp, FA_0_0_OFF2 #fpr
+ REG_S t9, CALLED_A5_OFF2($fp)
+
+ b $do_closure
+
+$LFE1:
+ .end ffi_go_closure_O32
+
.align 2
.globl ffi_closure_O32
.ent ffi_closure_O32
ffi_closure_O32:
-$LFB1:
+$LFB2:
# Prologue
.frame $fp, SIZEOF_FRAME2, ra
.set noreorder
@@ -239,14 +317,14 @@ $LFB1:
.set reorder
SUBU $sp, SIZEOF_FRAME2
.cprestore GP_OFF2
-$LCFI4:
+$LCFI20:
REG_S $16, S0_OFF2($sp) # Save s0
REG_S $fp, FP_OFF2($sp) # Save frame pointer
REG_S ra, RA_OFF2($sp) # Save return address
-$LCFI6:
+$LCFI21:
move $fp, $sp
-$LCFI7:
+$LCFI22:
# Store all possible argument registers. If there are more than
# four arguments, then they are stored above where we put a3.
REG_S a0, A0_OFF2($fp)
@@ -261,16 +339,27 @@ $LCFI7:
li $13, 1 # FFI_O32
bne $16, $13, 1f # Skip fp save if FFI_O32_SOFT_FLOAT
+#ifndef __mips_soft_float
# Store all possible float/double registers.
s.d $f12, FA_0_0_OFF2($fp)
s.d $f14, FA_1_0_OFF2($fp)
+#endif
1:
- # Call ffi_closure_mips_inner_O32 to do the work.
+ # prepare arguments for ffi_closure_mips_inner_O32
+ REG_L a0, 20($12) # cif pointer follows tramp.
+ REG_L a1, 24($12) # fun
+ REG_L a2, 28($12) # user_data
+ addu a3, $fp, V0_OFF2 # rvalue
+
+ addu t9, $fp, A0_OFF2 # ar
+ REG_S t9, CALLED_A4_OFF2($fp)
+
+ addu t9, $fp, FA_0_0_OFF2 #fpr
+ REG_S t9, CALLED_A5_OFF2($fp)
+
+$do_closure:
la t9, ffi_closure_mips_inner_O32
- move a0, $12 # Pointer to the ffi_closure
- addu a1, $fp, V0_OFF2
- addu a2, $fp, A0_OFF2
- addu a3, $fp, FA_0_0_OFF2
+ # Call ffi_closure_mips_inner_O32 to do the work.
jalr t9
# Load the return value into the appropriate register.
@@ -281,6 +370,7 @@ $LCFI7:
li $13, 1 # FFI_O32
bne $16, $13, 1f # Skip fp restore if FFI_O32_SOFT_FLOAT
+#ifndef __mips_soft_float
li $9, FFI_TYPE_FLOAT
l.s $f0, V0_OFF2($fp)
beq $8, $9, closure_done
@@ -288,6 +378,7 @@ $LCFI7:
li $9, FFI_TYPE_DOUBLE
l.d $f0, V0_OFF2($fp)
beq $8, $9, closure_done
+#endif
1:
REG_L $3, V1_OFF2($fp)
REG_L $2, V0_OFF2($fp)
@@ -300,7 +391,7 @@ closure_done:
REG_L ra, RA_OFF2($sp) # Restore return address
ADDU $sp, SIZEOF_FRAME2
j ra
-$LFE1:
+$LFE2:
.end ffi_closure_O32
/* DWARF-2 unwind info. */
@@ -322,6 +413,7 @@ $LSCIE0:
.uleb128 0x0
.align 2
$LECIE0:
+
$LSFDE0:
.4byte $LEFDE0-$LASFDE0 # FDE Length
$LASFDE0:
@@ -330,11 +422,11 @@ $LASFDE0:
.4byte $LFE0-$LFB0 # FDE address range
.uleb128 0x0 # Augmentation size
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI0-$LFB0
+ .4byte $LCFI00-$LFB0
.byte 0xe # DW_CFA_def_cfa_offset
.uleb128 0x18
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI2-$LCFI0
+ .4byte $LCFI01-$LCFI00
.byte 0x11 # DW_CFA_offset_extended_sf
.uleb128 0x1e # $fp
.sleb128 -2 # SIZEOF_FRAME2 - 2*FFI_SIZEOF_ARG($sp)
@@ -342,12 +434,13 @@ $LASFDE0:
.uleb128 0x1f # $ra
.sleb128 -1 # SIZEOF_FRAME2 - 1*FFI_SIZEOF_ARG($sp)
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI3-$LCFI2
+ .4byte $LCFI02-$LCFI01
.byte 0xc # DW_CFA_def_cfa
.uleb128 0x1e
.uleb128 0x18
.align 2
$LEFDE0:
+
$LSFDE1:
.4byte $LEFDE1-$LASFDE1 # FDE Length
$LASFDE1:
@@ -356,11 +449,11 @@ $LASFDE1:
.4byte $LFE1-$LFB1 # FDE address range
.uleb128 0x0 # Augmentation size
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI4-$LFB1
+ .4byte $LCFI10-$LFB1
.byte 0xe # DW_CFA_def_cfa_offset
- .uleb128 0x38
+ .uleb128 SIZEOF_FRAME2
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI6-$LCFI4
+ .4byte $LCFI11-$LCFI10
.byte 0x11 # DW_CFA_offset_extended_sf
.uleb128 0x10 # $16
.sleb128 -3 # SIZEOF_FRAME2 - 3*FFI_SIZEOF_ARG($sp)
@@ -371,11 +464,41 @@ $LASFDE1:
.uleb128 0x1f # $ra
.sleb128 -1 # SIZEOF_FRAME2 - 1*FFI_SIZEOF_ARG($sp)
.byte 0x4 # DW_CFA_advance_loc4
- .4byte $LCFI7-$LCFI6
+ .4byte $LCFI12-$LCFI11
.byte 0xc # DW_CFA_def_cfa
.uleb128 0x1e
- .uleb128 0x38
+ .uleb128 SIZEOF_FRAME2
.align 2
$LEFDE1:
+$LSFDE2:
+ .4byte $LEFDE2-$LASFDE2 # FDE Length
+$LASFDE2:
+ .4byte $LASFDE2-$Lframe0 # FDE CIE offset
+ .4byte $LFB2 # FDE initial location
+ .4byte $LFE2-$LFB2 # FDE address range
+ .uleb128 0x0 # Augmentation size
+ .byte 0x4 # DW_CFA_advance_loc4
+ .4byte $LCFI20-$LFB2
+ .byte 0xe # DW_CFA_def_cfa_offset
+ .uleb128 SIZEOF_FRAME2
+ .byte 0x4 # DW_CFA_advance_loc4
+ .4byte $LCFI21-$LCFI20
+ .byte 0x11 # DW_CFA_offset_extended_sf
+ .uleb128 0x10 # $16
+ .sleb128 -3 # SIZEOF_FRAME2 - 3*FFI_SIZEOF_ARG($sp)
+ .byte 0x11 # DW_CFA_offset_extended_sf
+ .uleb128 0x1e # $fp
+ .sleb128 -2 # SIZEOF_FRAME2 - 2*FFI_SIZEOF_ARG($sp)
+ .byte 0x11 # DW_CFA_offset_extended_sf
+ .uleb128 0x1f # $ra
+ .sleb128 -1 # SIZEOF_FRAME2 - 1*FFI_SIZEOF_ARG($sp)
+ .byte 0x4 # DW_CFA_advance_loc4
+ .4byte $LCFI22-$LCFI21
+ .byte 0xc # DW_CFA_def_cfa
+ .uleb128 0x1e
+ .uleb128 SIZEOF_FRAME2
+ .align 2
+$LEFDE2:
+
#endif
diff --git a/libffi/src/moxie/eabi.S b/libffi/src/moxie/eabi.S
index ac7aceb..10cfb04 100644
--- a/libffi/src/moxie/eabi.S
+++ b/libffi/src/moxie/eabi.S
@@ -59,7 +59,7 @@ ffi_call_EABI:
mov $r6, $r4 /* Save result buffer */
mov $r7, $r5 /* Save the target fn */
mov $r8, $r3 /* Save the flags */
- sub.l $sp, $r2 /* Allocate stack space */
+ sub $sp, $r2 /* Allocate stack space */
mov $r0, $sp /* We can stomp over $r0 */
/* $r1 is already set up */
jsra ffi_prep_args
diff --git a/libffi/src/moxie/ffi.c b/libffi/src/moxie/ffi.c
index 540a042..16d2bb3 100644
--- a/libffi/src/moxie/ffi.c
+++ b/libffi/src/moxie/ffi.c
@@ -1,5 +1,5 @@
/* -----------------------------------------------------------------------
- ffi.c - Copyright (C) 2012, 2013 Anthony Green
+ ffi.c - Copyright (C) 2012, 2013, 2018 Anthony Green
Moxie Foreign Function Interface
@@ -100,7 +100,7 @@ void *ffi_prep_args(char *stack, extended_cif *ecif)
count += z;
}
- return (stack + ((count > 24) ? 24 : ALIGN_DOWN(count, 8)));
+ return (stack + ((count > 24) ? 24 : FFI_ALIGN_DOWN(count, 8)));
}
/* Perform machine dependent cif processing */
@@ -111,7 +111,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
else
cif->flags = cif->rtype->size;
- cif->bytes = ALIGN (cif->bytes, 8);
+ cif->bytes = FFI_ALIGN (cif->bytes, 8);
return FFI_OK;
}
@@ -159,7 +159,7 @@ void ffi_closure_eabi (unsigned arg1, unsigned arg2, unsigned arg3,
unsigned arg4, unsigned arg5, unsigned arg6)
{
/* This function is called by a trampoline. The trampoline stows a
- pointer to the ffi_closure object in $r7. We must save this
+ pointer to the ffi_closure object in $r12. We must save this
pointer in a place that will persist while we do our work. */
register ffi_closure *creg __asm__ ("$r12");
ffi_closure *closure = creg;
@@ -215,7 +215,18 @@ void ffi_closure_eabi (unsigned arg1, unsigned arg2, unsigned arg3,
break;
default:
/* This is an 8-byte value. */
- avalue[i] = ptr;
+ if (ptr == (char *) &register_args[5])
+ {
+ /* The value is split across two locations */
+ unsigned *ip = alloca(8);
+ avalue[i] = ip;
+ ip[0] = *(unsigned *) ptr;
+ ip[1] = *(unsigned *) stack_args;
+ }
+ else
+ {
+ avalue[i] = ptr;
+ }
ptr += 4;
break;
}
@@ -223,8 +234,10 @@ void ffi_closure_eabi (unsigned arg1, unsigned arg2, unsigned arg3,
/* If we've handled more arguments than fit in registers,
start looking at the those passed on the stack. */
- if (ptr == &register_args[6])
+ if (ptr == (char *) &register_args[6])
ptr = stack_args;
+ else if (ptr == (char *) &register_args[7])
+ ptr = stack_args + 4;
}
/* Invoke the closure. */
@@ -257,7 +270,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
fn = (unsigned long) ffi_closure_eabi;
- tramp[0] = 0x01e0; /* ldi.l $r7, .... */
+ tramp[0] = 0x01e0; /* ldi.l $r12, .... */
tramp[1] = cls >> 16;
tramp[2] = cls & 0xffff;
tramp[3] = 0x1a00; /* jmpa .... */
diff --git a/libffi/src/nios2/ffi.c b/libffi/src/nios2/ffi.c
index 2efa033..721080d 100644
--- a/libffi/src/nios2/ffi.c
+++ b/libffi/src/nios2/ffi.c
@@ -101,7 +101,7 @@ void ffi_prep_args (char *stack, extended_cif *ecif)
/* Align argp as appropriate for the argument type. */
if ((alignment - 1) & (unsigned) argp)
- argp = (char *) ALIGN (argp, alignment);
+ argp = (char *) FFI_ALIGN (argp, alignment);
/* Copy the argument, promoting integral types smaller than a
word to word size. */
@@ -230,7 +230,7 @@ ffi_closure_helper (unsigned char *args,
/* Align argp as appropriate for the argument type. */
if ((alignment - 1) & (unsigned) argp)
- argp = (char *) ALIGN (argp, alignment);
+ argp = (char *) FFI_ALIGN (argp, alignment);
/* Arguments smaller than an int are promoted to int. */
if (size < sizeof (int))
diff --git a/libffi/src/pa/ffi.c b/libffi/src/pa/ffi.c
index 0da8184..95e6694 100644
--- a/libffi/src/pa/ffi.c
+++ b/libffi/src/pa/ffi.c
@@ -1,6 +1,5 @@
/* -----------------------------------------------------------------------
- ffi.c - (c) 2016 John David Anglin
- (c) 2011 Anthony Green
+ ffi.c - (c) 2011 Anthony Green
(c) 2008 Red Hat, Inc.
(c) 2006 Free Software Foundation, Inc.
(c) 2003-2004 Randolph Chung <tausq@debian.org>
@@ -52,8 +51,7 @@
#define debug(lvl, x...) do { if (lvl <= DEBUG_LEVEL) { printf(x); } } while (0)
-static inline int
-ffi_struct_type (ffi_type *t)
+static inline int ffi_struct_type(ffi_type *t)
{
size_t sz = t->size;
@@ -141,8 +139,7 @@ ffi_struct_type (ffi_type *t)
NOTE: We load floating point args in this function... that means we
assume gcc will not mess with fp regs in here. */
-void
-ffi_prep_args_pa32 (UINT32 *stack, extended_cif *ecif, unsigned bytes)
+void ffi_prep_args_pa32(UINT32 *stack, extended_cif *ecif, unsigned bytes)
{
register unsigned int i;
register ffi_type **p_arg;
@@ -278,8 +275,7 @@ ffi_prep_args_pa32 (UINT32 *stack, extended_cif *ecif, unsigned bytes)
return;
}
-static void
-ffi_size_stack_pa32 (ffi_cif *cif)
+static void ffi_size_stack_pa32(ffi_cif *cif)
{
ffi_type **ptr;
int i;
@@ -320,8 +316,7 @@ ffi_size_stack_pa32 (ffi_cif *cif)
}
/* Perform machine dependent cif processing. */
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
{
/* Set the return type flag */
switch (cif->rtype->type)
@@ -374,13 +369,11 @@ ffi_prep_cif_machdep (ffi_cif *cif)
return FFI_OK;
}
-extern void ffi_call_pa32 (void (*)(UINT32 *, extended_cif *, unsigned),
- extended_cif *, unsigned, unsigned, unsigned *,
- void (*fn)(void), void *closure);
+extern void ffi_call_pa32(void (*)(UINT32 *, extended_cif *, unsigned),
+ extended_cif *, unsigned, unsigned, unsigned *,
+ void (*fn)(void));
-static void
-ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue,
- void *closure)
+void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
{
extended_cif ecif;
@@ -408,8 +401,8 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue,
{
case FFI_PA32:
debug(3, "Calling ffi_call_pa32: ecif=%p, bytes=%u, flags=%u, rvalue=%p, fn=%p\n", &ecif, cif->bytes, cif->flags, ecif.rvalue, (void *)fn);
- ffi_call_pa32 (ffi_prep_args_pa32, &ecif, cif->bytes,
- cif->flags, ecif.rvalue, fn, closure);
+ ffi_call_pa32(ffi_prep_args_pa32, &ecif, cif->bytes,
+ cif->flags, ecif.rvalue, fn);
break;
default:
@@ -418,60 +411,35 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue,
}
}
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
- ffi_call_int (cif, fn, rvalue, avalue, NULL);
-}
-
-void
-ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue,
- void *closure)
-{
- ffi_call_int (cif, fn, rvalue, avalue, closure);
-}
-
#if FFI_CLOSURES
/* This is more-or-less an inverse of ffi_call -- we have arguments on
the stack, and we need to fill them into a cif structure and invoke
the user function. This really ought to be in asm to make sure
the compiler doesn't do things we don't expect. */
-ffi_status
-ffi_closure_inner_pa32 (void *closure, UINT32 *stack, int closure_type)
+ffi_status ffi_closure_inner_pa32(ffi_closure *closure, UINT32 *stack)
{
ffi_cif *cif;
- void (*fun)(ffi_cif *,void *,void **,void *);
- void *user_data;
void **avalue;
void *rvalue;
- UINT32 ret[2]; /* function can return up to 64-bits in registers */
+ /* Functions can return up to 64-bits in registers. Return address
+ must be double word aligned. */
+ union { double rd; UINT32 ret[2]; } u;
ffi_type **p_arg;
char *tmp;
int i, avn;
unsigned int slot = FIRST_ARG_SLOT;
register UINT32 r28 asm("r28");
+ ffi_closure *c = (ffi_closure *)FFI_RESTORE_PTR (closure);
- /* A non-zero closure type indicates a go closure. */
- if (closure_type)
- {
- cif = ((ffi_go_closure *)closure)->cif;
- fun = ((ffi_go_closure *)closure)->fun;
- user_data = closure;
- }
- else
- {
- cif = ((ffi_closure *)closure)->cif;
- fun = ((ffi_closure *)closure)->fun;
- user_data = ((ffi_closure *)closure)->user_data;
- }
+ cif = closure->cif;
/* If returning via structure, callee will write to our pointer. */
if (cif->flags == FFI_TYPE_STRUCT)
rvalue = (void *)r28;
else
- rvalue = &ret[0];
+ rvalue = &u;
- avalue = (void **) alloca (cif->nargs * FFI_SIZEOF_ARG);
+ avalue = (void **)alloca(cif->nargs * FFI_SIZEOF_ARG);
avn = cif->nargs;
p_arg = cif->arg_types;
@@ -564,35 +532,35 @@ ffi_closure_inner_pa32 (void *closure, UINT32 *stack, int closure_type)
}
/* Invoke the closure. */
- fun (cif, rvalue, avalue, user_data);
+ (c->fun) (cif, rvalue, avalue, c->user_data);
- debug(3, "after calling function, ret[0] = %08x, ret[1] = %08x\n", ret[0],
- ret[1]);
+ debug(3, "after calling function, ret[0] = %08x, ret[1] = %08x\n", u.ret[0],
+ u.ret[1]);
/* Store the result using the lower 2 bytes of the flags. */
switch (cif->flags)
{
case FFI_TYPE_UINT8:
- *(stack - FIRST_ARG_SLOT) = (UINT8)(ret[0] >> 24);
+ *(stack - FIRST_ARG_SLOT) = (UINT8)(u.ret[0] >> 24);
break;
case FFI_TYPE_SINT8:
- *(stack - FIRST_ARG_SLOT) = (SINT8)(ret[0] >> 24);
+ *(stack - FIRST_ARG_SLOT) = (SINT8)(u.ret[0] >> 24);
break;
case FFI_TYPE_UINT16:
- *(stack - FIRST_ARG_SLOT) = (UINT16)(ret[0] >> 16);
+ *(stack - FIRST_ARG_SLOT) = (UINT16)(u.ret[0] >> 16);
break;
case FFI_TYPE_SINT16:
- *(stack - FIRST_ARG_SLOT) = (SINT16)(ret[0] >> 16);
+ *(stack - FIRST_ARG_SLOT) = (SINT16)(u.ret[0] >> 16);
break;
case FFI_TYPE_INT:
case FFI_TYPE_SINT32:
case FFI_TYPE_UINT32:
- *(stack - FIRST_ARG_SLOT) = ret[0];
+ *(stack - FIRST_ARG_SLOT) = u.ret[0];
break;
case FFI_TYPE_SINT64:
case FFI_TYPE_UINT64:
- *(stack - FIRST_ARG_SLOT) = ret[0];
- *(stack - FIRST_ARG_SLOT - 1) = ret[1];
+ *(stack - FIRST_ARG_SLOT) = u.ret[0];
+ *(stack - FIRST_ARG_SLOT - 1) = u.ret[1];
break;
case FFI_TYPE_DOUBLE:
@@ -612,7 +580,7 @@ ffi_closure_inner_pa32 (void *closure, UINT32 *stack, int closure_type)
case FFI_TYPE_SMALL_STRUCT4:
tmp = (void*)(stack - FIRST_ARG_SLOT);
tmp += 4 - cif->rtype->size;
- memcpy((void*)tmp, &ret[0], cif->rtype->size);
+ memcpy((void*)tmp, &u, cif->rtype->size);
break;
case FFI_TYPE_SMALL_STRUCT5:
@@ -633,7 +601,7 @@ ffi_closure_inner_pa32 (void *closure, UINT32 *stack, int closure_type)
}
memset (ret2, 0, sizeof (ret2));
- memcpy ((char *)ret2 + off, ret, 8 - off);
+ memcpy ((char *)ret2 + off, &u, 8 - off);
*(stack - FIRST_ARG_SLOT) = ret2[0];
*(stack - FIRST_ARG_SLOT - 1) = ret2[1];
@@ -656,7 +624,6 @@ ffi_closure_inner_pa32 (void *closure, UINT32 *stack, int closure_type)
cif specifies the argument and result types for fun.
The cif must already be prep'ed. */
-extern void ffi_go_closure_pa32(void);
extern void ffi_closure_pa32(void);
ffi_status
@@ -666,107 +633,42 @@ ffi_prep_closure_loc (ffi_closure* closure,
void *user_data,
void *codeloc)
{
- UINT32 *tramp = (UINT32 *)(closure->tramp);
-#ifdef PA_HPUX
- UINT32 *tmp;
-#endif
-
- if (cif->abi != FFI_PA32)
- return FFI_BAD_ABI;
-
- /* Make a small trampoline that will branch to our
- handler function. Use PC-relative addressing. */
-
-#ifdef PA_LINUX
- tramp[0] = 0xeaa00000; /* b,l .+8,%r21 ; %r21 <- pc+8 */
- tramp[1] = 0xd6a01c1e; /* depi 0,31,2,%r21 ; mask priv bits */
- tramp[2] = 0x4aa10028; /* ldw 20(%r21),%r1 ; load plabel */
- tramp[3] = 0x36b53ff1; /* ldo -8(%r21),%r21 ; get closure addr */
- tramp[4] = 0x0c201096; /* ldw 0(%r1),%r22 ; address of handler */
- tramp[5] = 0xeac0c000; /* bv%r0(%r22) ; branch to handler */
- tramp[6] = 0x0c281093; /* ldw 4(%r1),%r19 ; GP of handler */
- tramp[7] = ((UINT32)(ffi_closure_pa32) & ~2);
-
- /* Flush d/icache -- have to flush up 2 two lines because of
- alignment. */
- __asm__ volatile(
- "fdc 0(%0)\n\t"
- "fdc %1(%0)\n\t"
- "fic 0(%%sr4, %0)\n\t"
- "fic %1(%%sr4, %0)\n\t"
- "sync\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n"
- :
- : "r"((unsigned long)tramp & ~31),
- "r"(32 /* stride */)
- : "memory");
-#endif
+ ffi_closure *c = (ffi_closure *)FFI_RESTORE_PTR (closure);
-#ifdef PA_HPUX
- tramp[0] = 0xeaa00000; /* b,l .+8,%r21 ; %r21 <- pc+8 */
- tramp[1] = 0xd6a01c1e; /* depi 0,31,2,%r21 ; mask priv bits */
- tramp[2] = 0x4aa10038; /* ldw 28(%r21),%r1 ; load plabel */
- tramp[3] = 0x36b53ff1; /* ldo -8(%r21),%r21 ; get closure addr */
- tramp[4] = 0x0c201096; /* ldw 0(%r1),%r22 ; address of handler */
- tramp[5] = 0x02c010b4; /* ldsid (%r22),%r20 ; load space id */
- tramp[6] = 0x00141820; /* mtsp %r20,%sr0 ; into %sr0 */
- tramp[7] = 0xe2c00000; /* be 0(%sr0,%r22) ; branch to handler */
- tramp[8] = 0x0c281093; /* ldw 4(%r1),%r19 ; GP of handler */
- tramp[9] = ((UINT32)(ffi_closure_pa32) & ~2);
-
- /* Flush d/icache -- have to flush three lines because of alignment. */
- __asm__ volatile(
- "copy %1,%0\n\t"
- "fdc,m %2(%0)\n\t"
- "fdc,m %2(%0)\n\t"
- "fdc,m %2(%0)\n\t"
- "ldsid (%1),%0\n\t"
- "mtsp %0,%%sr0\n\t"
- "copy %1,%0\n\t"
- "fic,m %2(%%sr0,%0)\n\t"
- "fic,m %2(%%sr0,%0)\n\t"
- "fic,m %2(%%sr0,%0)\n\t"
- "sync\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n\t"
- "nop\n"
- : "=&r" ((unsigned long)tmp)
- : "r" ((unsigned long)tramp & ~31),
- "r" (32/* stride */)
- : "memory");
-#endif
+ /* The layout of a function descriptor. A function pointer with the PLABEL
+ bit set points to a function descriptor. */
+ struct pa32_fd
+ {
+ UINT32 code_pointer;
+ UINT32 gp;
+ };
- closure->cif = cif;
- closure->user_data = user_data;
- closure->fun = fun;
+ struct ffi_pa32_trampoline_struct
+ {
+ UINT32 code_pointer; /* Pointer to ffi_closure_unix. */
+ UINT32 fake_gp; /* Pointer to closure, installed as gp. */
+ UINT32 real_gp; /* Real gp value. */
+ };
- return FFI_OK;
-}
+ struct ffi_pa32_trampoline_struct *tramp;
+ struct pa32_fd *fd;
-#ifdef FFI_GO_CLOSURES
-ffi_status
-ffi_prep_go_closure (ffi_go_closure *closure,
- ffi_cif *cif,
- void (*fun)(ffi_cif *, void *, void **, void *))
-{
if (cif->abi != FFI_PA32)
return FFI_BAD_ABI;
- closure->tramp = &ffi_go_closure_pa32;
- closure->cif = cif;
- closure->fun = fun;
+ /* Get function descriptor address for ffi_closure_pa32. */
+ fd = (struct pa32_fd *)((UINT32)ffi_closure_pa32 & ~3);
+
+ /* Setup trampoline. */
+ tramp = (struct ffi_pa32_trampoline_struct *)c->tramp;
+ tramp->code_pointer = fd->code_pointer;
+ tramp->fake_gp = (UINT32)codeloc & ~3;
+ tramp->real_gp = fd->gp;
+
+ c->cif = cif;
+ c->user_data = user_data;
+ c->fun = fun;
return FFI_OK;
}
-#endif /* FFI_GO_CLOSURES */
#endif
diff --git a/libffi/src/pa/ffitarget.h b/libffi/src/pa/ffitarget.h
index 024ac81..df1209e 100644
--- a/libffi/src/pa/ffitarget.h
+++ b/libffi/src/pa/ffitarget.h
@@ -1,6 +1,5 @@
/* -----------------------------------------------------------------*-C-*-
- ffitarget.h - Copyright (c) 2016 John David Anglin
- Copyright (c) 2012 Anthony Green
+ ffitarget.h - Copyright (c) 2012 Anthony Green
Copyright (c) 1996-2003 Red Hat, Inc.
Target configuration macros for hppa.
@@ -68,14 +67,8 @@ typedef enum ffi_abi {
/* ---- Definitions for closures ----------------------------------------- */
#define FFI_CLOSURES 1
-#define FFI_GO_CLOSURES 1
#define FFI_NATIVE_RAW_API 0
-
-#ifdef PA_LINUX
-#define FFI_TRAMPOLINE_SIZE 32
-#else
-#define FFI_TRAMPOLINE_SIZE 40
-#endif
+#define FFI_TRAMPOLINE_SIZE 12
#define FFI_TYPE_SMALL_STRUCT2 -1
#define FFI_TYPE_SMALL_STRUCT3 -2
diff --git a/libffi/src/pa/hpux32.S b/libffi/src/pa/hpux32.S
index 4a47da3..d0e5f69 100644
--- a/libffi/src/pa/hpux32.S
+++ b/libffi/src/pa/hpux32.S
@@ -1,7 +1,6 @@
/* -----------------------------------------------------------------------
hpux32.S - Copyright (c) 2006 Free Software Foundation, Inc.
(c) 2008 Red Hat, Inc.
- (c) 2016 John David Anglin
based on src/pa/linux.S
HP-UX PA Foreign Function Interface
@@ -42,8 +41,7 @@
unsigned bytes,
unsigned flags,
unsigned *rvalue,
- void (*fn)(void),
- ffi_go_closure *closure);
+ void (*fn)(void));
*/
.export ffi_call_pa32,ENTRY,PRIV_LEV=3
@@ -106,7 +104,6 @@ L$CFI13
we need to give it a place to put the result. */
ldw -52(%r3), %ret0 ; %ret0 <- rvalue
ldw -56(%r3), %r22 ; %r22 <- function to call
- ldw -60(%r3), %ret1 ; %ret1 <- closure
bl $$dyncall, %r31 ; Call the user function
copy %r31, %rp
@@ -262,7 +259,7 @@ L$done
L$FE1
/* void ffi_closure_pa32(void);
- Called with closure argument in %r21 */
+ Called with closure argument in %r19 */
.SPACE $TEXT$
.SUBSPA $CODE$
@@ -288,9 +285,9 @@ L$CFI22
stw %arg2, -44(%r3)
stw %arg3, -48(%r3)
- /* Closure type 0. */
- copy %r21, %arg0
- copy %r0, %arg2
+ /* Retrieve closure pointer and real gp. */
+ copy %r19, %arg0
+ ldw 8(%r19), %r19
bl ffi_closure_inner_pa32, %r2
copy %r3, %arg1
ldwm -64(%sp), %r3
@@ -302,47 +299,6 @@ L$CFI22
.procend
L$FE2:
- /* void ffi_go_closure_pa32(void);
- Called with closure argument in %ret1 */
-
- .SPACE $TEXT$
- .SUBSPA $CODE$
- .export ffi_go_closure_pa32,ENTRY,PRIV_LEV=3,RTNVAL=GR
- .import ffi_closure_inner_pa32,CODE
- .align 4
-L$FB3
-ffi_go_closure_pa32
- .proc
- .callinfo FRAME=64,CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=3
- .entry
-
- stw %rp, -20(%sp)
- copy %r3, %r1
-L$CFI31
- copy %sp, %r3
-L$CFI32
- stwm %r1, 64(%sp)
-
- /* Put arguments onto the stack and call ffi_closure_inner. */
- stw %arg0, -36(%r3)
- stw %arg1, -40(%r3)
- stw %arg2, -44(%r3)
- stw %arg3, -48(%r3)
-
- /* Closure type 1. */
- copy %ret1, %arg0
- ldi 1, %arg2
- bl ffi_closure_inner_pa32, %r2
- copy %r3, %arg1
- ldwm -64(%sp), %r3
- ldw -20(%sp), %rp
- ldw -36(%sp), %ret0
- bv %r0(%rp)
- ldw -40(%sp), %ret1
- .exit
- .procend
-L$FE3:
-
.SPACE $PRIVATE$
.SUBSPA $DATA$
@@ -412,25 +368,3 @@ L$ASFDE2:
.align 4
L$EFDE2:
-
-L$SFDE3:
- .word L$EFDE3-L$ASFDE3 ;# FDE Length
-L$ASFDE3:
- .word L$ASFDE3-L$frame1 ;# FDE CIE offset
- .word L$FB3 ;# FDE initial location
- .word L$FE3-L$FB3 ;# FDE address range
- .byte 0x4 ;# DW_CFA_advance_loc4
- .word L$CFI31-L$FB3
- .byte 0x83 ;# DW_CFA_offset, column 0x3
- .uleb128 0x0
- .byte 0x11 ;# DW_CFA_offset_extended_sf
- .uleb128 0x2
- .sleb128 -5
-
- .byte 0x4 ;# DW_CFA_advance_loc4
- .word L$CFI32-L$CFI31
- .byte 0xd ;# DW_CFA_def_cfa_register = r3
- .uleb128 0x3
-
- .align 4
-L$EFDE3:
diff --git a/libffi/src/pa/linux.S b/libffi/src/pa/linux.S
index 6026904..33ef0b1 100644
--- a/libffi/src/pa/linux.S
+++ b/libffi/src/pa/linux.S
@@ -1,7 +1,6 @@
/* -----------------------------------------------------------------------
linux.S - (c) 2003-2004 Randolph Chung <tausq@debian.org>
(c) 2008 Red Hat, Inc.
- (c) 2016 John David Anglin
HPPA Foreign Function Interface
@@ -38,26 +37,24 @@
unsigned bytes,
unsigned flags,
unsigned *rvalue,
- void (*fn)(void),
- ffi_go_closure *closure);
+ void (*fn)(void));
*/
.export ffi_call_pa32,code
.import ffi_prep_args_pa32,code
.type ffi_call_pa32, @function
- .cfi_startproc
+.LFB1:
ffi_call_pa32:
.proc
.callinfo FRAME=64,CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=4
.entry
stw %rp, -20(%sp)
copy %r3, %r1
- .cfi_offset 2, -20
- .cfi_register 3, 1
+.LCFI11:
copy %sp, %r3
- .cfi_def_cfa_register 3
+.LCFI12:
/* Setup the stack for calling prep_args...
We want the stack to look like this:
@@ -73,8 +70,8 @@ ffi_call_pa32:
*/
stwm %r1, 64(%sp)
- .cfi_offset 3, 0
stw %r4, 12(%r3)
+.LCFI13:
copy %sp, %r4
addl %arg2, %r4, %arg0 /* arg stack */
@@ -101,7 +98,6 @@ ffi_call_pa32:
we need to give it a place to put the result. */
ldw -52(%r3), %ret0 /* %ret0 <- rvalue */
ldw -56(%r3), %r22 /* %r22 <- function to call */
- ldw -60(%r3), %ret1 /* %ret1 <- closure */
bl $$dyncall, %r31 /* Call the user function */
copy %r31, %rp
@@ -253,27 +249,27 @@ ffi_call_pa32:
nop
.exit
.procend
- .cfi_endproc
+.LFE1:
/* void ffi_closure_pa32(void);
- Called with ffi_closure argument in %r21. */
+ Called with closure argument in %r19 */
.export ffi_closure_pa32,code
.import ffi_closure_inner_pa32,code
+
.type ffi_closure_pa32, @function
- .cfi_startproc
+.LFB2:
ffi_closure_pa32:
.proc
.callinfo FRAME=64,CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=3
.entry
stw %rp, -20(%sp)
+.LCFI20:
copy %r3, %r1
- .cfi_offset 2, -20
- .cfi_register 3, 1
+.LCFI21:
copy %sp, %r3
- .cfi_def_cfa_register 3
+.LCFI22:
stwm %r1, 64(%sp)
- .cfi_offset 3, 0
/* Put arguments onto the stack and call ffi_closure_inner. */
stw %arg0, -36(%r3)
@@ -281,9 +277,9 @@ ffi_closure_pa32:
stw %arg2, -44(%r3)
stw %arg3, -48(%r3)
- /* Closure type 0. */
- copy %r21, %arg0
- copy %r0, %arg2
+ /* Retrieve closure pointer and real gp. */
+ copy %r19, %arg0
+ ldw 8(%r19), %r19
bl ffi_closure_inner_pa32, %r2
copy %r3, %arg1
@@ -295,46 +291,90 @@ ffi_closure_pa32:
.exit
.procend
- .cfi_endproc
-
- /* void ffi_go_closure_pa32(void);
- Called with ffi_go_closure argument in %ret1. */
- .export ffi_go_closure_pa32,code
- .import ffi_closure_inner_pa32,code
- .type ffi_go_closure_pa32, @function
- .cfi_startproc
-ffi_go_closure_pa32:
- .proc
- .callinfo FRAME=64,CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=3
- .entry
-
- stw %rp, -20(%sp)
- copy %r3, %r1
- .cfi_offset 2, -20
- .cfi_register 3, 1
- copy %sp, %r3
- .cfi_def_cfa_register 3
- stwm %r1, 64(%sp)
- .cfi_offset 3, 0
-
- /* Put arguments onto the stack and call ffi_closure_inner. */
- stw %arg0, -36(%r3)
- stw %arg1, -40(%r3)
- stw %arg2, -44(%r3)
- stw %arg3, -48(%r3)
-
- /* Closure type 1. */
- copy %ret1, %arg0
- ldi 1, %arg2
- bl ffi_closure_inner_pa32, %r2
- copy %r3, %arg1
+.LFE2:
+
+ .section ".eh_frame",EH_FRAME_FLAGS,@progbits
+.Lframe1:
+ .word .LECIE1-.LSCIE1 ;# Length of Common Information Entry
+.LSCIE1:
+ .word 0x0 ;# CIE Identifier Tag
+ .byte 0x1 ;# CIE Version
+#ifdef __PIC__
+ .ascii "zR\0" ;# CIE Augmentation: 'z' - data, 'R' - DW_EH_PE_... data
+#else
+ .ascii "\0" ;# CIE Augmentation
+#endif
+ .uleb128 0x1 ;# CIE Code Alignment Factor
+ .sleb128 4 ;# CIE Data Alignment Factor
+ .byte 0x2 ;# CIE RA Column
+#ifdef __PIC__
+ .uleb128 0x1 ;# Augmentation size
+ .byte 0x1b ;# FDE Encoding (DW_EH_PE_pcrel|DW_EH_PE_sdata4)
+#endif
+ .byte 0xc ;# DW_CFA_def_cfa
+ .uleb128 0x1e
+ .uleb128 0x0
+ .align 4
+.LECIE1:
+.LSFDE1:
+ .word .LEFDE1-.LASFDE1 ;# FDE Length
+.LASFDE1:
+ .word .LASFDE1-.Lframe1 ;# FDE CIE offset
+#ifdef __PIC__
+ .word .LFB1-. ;# FDE initial location
+#else
+ .word .LFB1 ;# FDE initial location
+#endif
+ .word .LFE1-.LFB1 ;# FDE address range
+#ifdef __PIC__
+ .uleb128 0x0 ;# Augmentation size: no data
+#endif
+ .byte 0x4 ;# DW_CFA_advance_loc4
+ .word .LCFI11-.LFB1
+ .byte 0x83 ;# DW_CFA_offset, column 0x3
+ .uleb128 0x0
+ .byte 0x11 ;# DW_CFA_offset_extended_sf; save r2 at [r30-20]
+ .uleb128 0x2
+ .sleb128 -5
+
+ .byte 0x4 ;# DW_CFA_advance_loc4
+ .word .LCFI12-.LCFI11
+ .byte 0xd ;# DW_CFA_def_cfa_register = r3
+ .uleb128 0x3
+
+ .byte 0x4 ;# DW_CFA_advance_loc4
+ .word .LCFI13-.LCFI12
+ .byte 0x84 ;# DW_CFA_offset, column 0x4
+ .uleb128 0x3
- ldwm -64(%sp), %r3
- ldw -20(%sp), %rp
- ldw -36(%sp), %ret0
- bv %r0(%r2)
- ldw -40(%sp), %ret1
+ .align 4
+.LEFDE1:
+
+.LSFDE2:
+ .word .LEFDE2-.LASFDE2 ;# FDE Length
+.LASFDE2:
+ .word .LASFDE2-.Lframe1 ;# FDE CIE offset
+#ifdef __PIC__
+ .word .LFB2-. ;# FDE initial location
+#else
+ .word .LFB2 ;# FDE initial location
+#endif
+ .word .LFE2-.LFB2 ;# FDE address range
+#ifdef __PIC__
+ .uleb128 0x0 ;# Augmentation size: no data
+#endif
+ .byte 0x4 ;# DW_CFA_advance_loc4
+ .word .LCFI21-.LFB2
+ .byte 0x83 ;# DW_CFA_offset, column 0x3
+ .uleb128 0x0
+ .byte 0x11 ;# DW_CFA_offset_extended_sf
+ .uleb128 0x2
+ .sleb128 -5
+
+ .byte 0x4 ;# DW_CFA_advance_loc4
+ .word .LCFI22-.LCFI21
+ .byte 0xd ;# DW_CFA_def_cfa_register = r3
+ .uleb128 0x3
- .exit
- .procend
- .cfi_endproc
+ .align 4
+.LEFDE2:
diff --git a/libffi/src/powerpc/asm.h b/libffi/src/powerpc/asm.h
index 994f62d..27b22f6 100644
--- a/libffi/src/powerpc/asm.h
+++ b/libffi/src/powerpc/asm.h
@@ -93,7 +93,7 @@
/* EALIGN is like ENTRY, but does alignment to 'words'*4 bytes
past a 2^align boundary. */
#ifdef PROF
-#define EALIGN(name, alignt, words) \
+#define EFFI_ALIGN(name, alignt, words) \
ASM_GLOBAL_DIRECTIVE C_SYMBOL_NAME(name); \
ASM_TYPE_DIRECTIVE (C_SYMBOL_NAME(name),@function) \
.align ALIGNARG(2); \
@@ -104,7 +104,7 @@
EALIGN_W_##words; \
0:
#else /* PROF */
-#define EALIGN(name, alignt, words) \
+#define EFFI_ALIGN(name, alignt, words) \
ASM_GLOBAL_DIRECTIVE C_SYMBOL_NAME(name); \
ASM_TYPE_DIRECTIVE (C_SYMBOL_NAME(name),@function) \
.align ALIGNARG(alignt); \
diff --git a/libffi/src/powerpc/darwin_closure.S b/libffi/src/powerpc/darwin_closure.S
index c7734d4..3121e6a 100644
--- a/libffi/src/powerpc/darwin_closure.S
+++ b/libffi/src/powerpc/darwin_closure.S
@@ -353,7 +353,7 @@ Lret_type13:
bgt Lstructend ; not a special small case
b Lsmallstruct ; see if we need more.
#else
- cmpi 0,r0,4
+ cmpwi 0,r0,4
bgt Lfinish ; not by value
lg r3,0(r5)
b Lfinish
@@ -494,8 +494,8 @@ LSFDE1:
LASFDE1:
.long LASFDE1-EH_frame1 ; FDE CIE offset
.g_long Lstartcode-. ; FDE initial location
- .set L$set$3,LFE1-Lstartcode
- .g_long L$set$3 ; FDE address range
+ .set L$set$2,LFE1-Lstartcode
+ .g_long L$set$2 ; FDE address range
.byte 0x0 ; uleb128 0x0; Augmentation size
.byte 0x4 ; DW_CFA_advance_loc4
.set L$set$3,LCFI1-LCFI0
diff --git a/libffi/src/powerpc/ffi.c b/libffi/src/powerpc/ffi.c
index 7eb543e..a19bcbb 100644
--- a/libffi/src/powerpc/ffi.c
+++ b/libffi/src/powerpc/ffi.c
@@ -85,8 +85,9 @@ ffi_call_int (ffi_cif *cif,
can write r3 and r4 to memory without worrying about struct size.
For ELFv2 ABI, use a bounce buffer for homogeneous structs too,
- for similar reasons. */
- unsigned long smst_buffer[8];
+ for similar reasons. This bounce buffer must be aligned to 16
+ bytes for use with homogeneous structs of vectors (float128). */
+ float128 smst_buffer[8];
extended_cif ecif;
ecif.cif = cif;
@@ -121,8 +122,9 @@ ffi_call_int (ffi_cif *cif,
# endif
/* The SYSV ABI returns a structure of up to 8 bytes in size
left-padded in r3/r4, and the ELFv2 ABI similarly returns a
- structure of up to 8 bytes in size left-padded in r3. */
- if (rsize <= 8)
+ structure of up to 8 bytes in size left-padded in r3. But
+ note that a structure of a single float is not paddded. */
+ if (rsize <= 8 && (cif->flags & FLAG_RETURNS_FP) == 0)
memcpy (rvalue, (char *) smst_buffer + 8 - rsize, rsize);
else
#endif
diff --git a/libffi/src/powerpc/ffi_darwin.c b/libffi/src/powerpc/ffi_darwin.c
index 6588e3c..64bb94d 100644
--- a/libffi/src/powerpc/ffi_darwin.c
+++ b/libffi/src/powerpc/ffi_darwin.c
@@ -33,7 +33,10 @@
#include <stdlib.h>
extern void ffi_closure_ASM (void);
+
+#if defined (FFI_GO_CLOSURES)
extern void ffi_go_closure_ASM (void);
+#endif
enum {
/* The assembly depends on these exact flags.
@@ -256,7 +259,7 @@ ffi_prep_args (extended_cif *ecif, unsigned long *const stack)
case FFI_TYPE_STRUCT:
size_al = (*ptr)->size;
#if defined(POWERPC_DARWIN64)
- next_arg = (unsigned long *)ALIGN((char *)next_arg, (*ptr)->alignment);
+ next_arg = (unsigned long *)FFI_ALIGN((char *)next_arg, (*ptr)->alignment);
darwin64_pass_struct_by_value (*ptr, (char *) *p_argv,
(unsigned) size_al,
(unsigned int *) &fparg_count,
@@ -267,7 +270,7 @@ ffi_prep_args (extended_cif *ecif, unsigned long *const stack)
/* If the first member of the struct is a double, then include enough
padding in the struct size to align it to double-word. */
if ((*ptr)->elements[0]->type == FFI_TYPE_DOUBLE)
- size_al = ALIGN((*ptr)->size, 8);
+ size_al = FFI_ALIGN((*ptr)->size, 8);
# if defined(POWERPC64)
FFI_ASSERT (abi != FFI_DARWIN);
@@ -353,7 +356,7 @@ darwin64_struct_size_exceeds_gprs_p (ffi_type *s, char *src, unsigned *nfpr)
ffi_type *p = s->elements[i];
/* Find the start of this item (0 for the first one). */
if (i > 0)
- struct_offset = ALIGN(struct_offset, p->alignment);
+ struct_offset = FFI_ALIGN(struct_offset, p->alignment);
item_base = src + struct_offset;
@@ -437,7 +440,7 @@ darwin64_pass_struct_floats (ffi_type *s, char *src,
ffi_type *p = s->elements[i];
/* Find the start of this item (0 for the first one). */
if (i > 0)
- struct_offset = ALIGN(struct_offset, p->alignment);
+ struct_offset = FFI_ALIGN(struct_offset, p->alignment);
item_base = src + struct_offset;
switch (p->type)
@@ -528,7 +531,7 @@ darwin64_struct_floats_to_mem (ffi_type *s, char *dest, double *fprs, unsigned *
ffi_type *p = s->elements[i];
/* Find the start of this item (0 for the first one). */
if (i > 0)
- struct_offset = ALIGN(struct_offset, p->alignment);
+ struct_offset = FFI_ALIGN(struct_offset, p->alignment);
item_base = dest + struct_offset;
switch (p->type)
@@ -605,10 +608,10 @@ darwin_adjust_aggregate_sizes (ffi_type *s)
align = 4;
#endif
/* Pad, if necessary, before adding the current item. */
- s->size = ALIGN(s->size, align) + p->size;
+ s->size = FFI_ALIGN(s->size, align) + p->size;
}
- s->size = ALIGN(s->size, s->alignment);
+ s->size = FFI_ALIGN(s->size, s->alignment);
/* This should not be necessary on m64, but harmless. */
if (s->elements[0]->type == FFI_TYPE_UINT64
@@ -641,10 +644,10 @@ aix_adjust_aggregate_sizes (ffi_type *s)
align = p->alignment;
if (i != 0 && p->type == FFI_TYPE_DOUBLE)
align = 4;
- s->size = ALIGN(s->size, align) + p->size;
+ s->size = FFI_ALIGN(s->size, align) + p->size;
}
- s->size = ALIGN(s->size, s->alignment);
+ s->size = FFI_ALIGN(s->size, s->alignment);
if (s->elements[0]->type == FFI_TYPE_UINT64
|| s->elements[0]->type == FFI_TYPE_SINT64
@@ -810,9 +813,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
16-byte-aligned. */
if (fparg_count >= NUM_FPR_ARG_REGISTERS)
#if defined (POWERPC64)
- intarg_count = ALIGN(intarg_count, 2);
+ intarg_count = FFI_ALIGN(intarg_count, 2);
#else
- intarg_count = ALIGN(intarg_count, 4);
+ intarg_count = FFI_ALIGN(intarg_count, 4);
#endif
break;
#endif
@@ -839,7 +842,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
#if defined(POWERPC_DARWIN64)
align_words = (*ptr)->alignment >> 3;
if (align_words)
- intarg_count = ALIGN(intarg_count, align_words);
+ intarg_count = FFI_ALIGN(intarg_count, align_words);
/* Base size of the struct. */
intarg_count += (size_al + 7) / 8;
/* If 16 bytes then don't worry about floats. */
@@ -849,11 +852,11 @@ ffi_prep_cif_machdep (ffi_cif *cif)
#else
align_words = (*ptr)->alignment >> 2;
if (align_words)
- intarg_count = ALIGN(intarg_count, align_words);
+ intarg_count = FFI_ALIGN(intarg_count, align_words);
/* If the first member of the struct is a double, then align
the struct to double-word.
if ((*ptr)->elements[0]->type == FFI_TYPE_DOUBLE)
- size_al = ALIGN((*ptr)->size, 8); */
+ size_al = FFI_ALIGN((*ptr)->size, 8); */
# ifdef POWERPC64
intarg_count += (size_al + 7) / 8;
# else
@@ -898,7 +901,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
bytes += NUM_GPR_ARG_REGISTERS * sizeof(long);
/* The stack space allocated needs to be a multiple of 16 bytes. */
- bytes = ALIGN(bytes, 16) ;
+ bytes = FFI_ALIGN(bytes, 16) ;
cif->flags = flags;
cif->bytes = bytes;
@@ -909,8 +912,10 @@ ffi_prep_cif_machdep (ffi_cif *cif)
extern void ffi_call_AIX(extended_cif *, long, unsigned, unsigned *,
void (*fn)(void), void (*fn2)(void));
+#if defined (FFI_GO_CLOSURES)
extern void ffi_call_go_AIX(extended_cif *, long, unsigned, unsigned *,
void (*fn)(void), void (*fn2)(void), void *closure);
+#endif
extern void ffi_call_DARWIN(extended_cif *, long, unsigned, unsigned *,
void (*fn)(void), void (*fn2)(void), ffi_type*);
@@ -950,6 +955,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
}
}
+#if defined (FFI_GO_CLOSURES)
void
ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue,
void *closure)
@@ -981,6 +987,7 @@ ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue,
break;
}
}
+#endif
static void flush_icache(char *);
static void flush_range(char *, int);
@@ -1110,6 +1117,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
return FFI_OK;
}
+#if defined (FFI_GO_CLOSURES)
ffi_status
ffi_prep_go_closure (ffi_go_closure* closure,
ffi_cif* cif,
@@ -1133,6 +1141,7 @@ ffi_prep_go_closure (ffi_go_closure* closure,
}
return FFI_OK;
}
+#endif
static void
flush_icache(char *addr)
@@ -1168,9 +1177,11 @@ ffi_type *
ffi_closure_helper_DARWIN (ffi_closure *, void *,
unsigned long *, ffi_dblfl *);
+#if defined (FFI_GO_CLOSURES)
ffi_type *
ffi_go_closure_helper_DARWIN (ffi_go_closure*, void *,
unsigned long *, ffi_dblfl *);
+#endif
/* Basically the trampoline invokes ffi_closure_ASM, and on
entry, r11 holds the address of the closure.
@@ -1272,7 +1283,7 @@ ffi_closure_helper_common (ffi_cif* cif,
case FFI_TYPE_STRUCT:
size_al = arg_types[i]->size;
#if defined(POWERPC_DARWIN64)
- pgr = (unsigned long *)ALIGN((char *)pgr, arg_types[i]->alignment);
+ pgr = (unsigned long *)FFI_ALIGN((char *)pgr, arg_types[i]->alignment);
if (size_al < 3 || size_al == 4)
{
avalue[i] = ((char *)pgr)+8-size_al;
@@ -1297,7 +1308,7 @@ ffi_closure_helper_common (ffi_cif* cif,
/* If the first member of the struct is a double, then align
the struct to double-word. */
if (arg_types[i]->elements[0]->type == FFI_TYPE_DOUBLE)
- size_al = ALIGN(arg_types[i]->size, 8);
+ size_al = FFI_ALIGN(arg_types[i]->size, 8);
# if defined(POWERPC64)
FFI_ASSERT (cif->abi != FFI_DARWIN);
avalue[i] = pgr;
@@ -1430,6 +1441,7 @@ ffi_closure_helper_DARWIN (ffi_closure *closure, void *rvalue,
closure->user_data, rvalue, pgr, pfr);
}
+#if defined (FFI_GO_CLOSURES)
ffi_type *
ffi_go_closure_helper_DARWIN (ffi_go_closure *closure, void *rvalue,
unsigned long *pgr, ffi_dblfl *pfr)
@@ -1437,4 +1449,4 @@ ffi_go_closure_helper_DARWIN (ffi_go_closure *closure, void *rvalue,
return ffi_closure_helper_common (closure->cif, closure->fun,
closure, rvalue, pgr, pfr);
}
-
+#endif
diff --git a/libffi/src/powerpc/ffi_linux64.c b/libffi/src/powerpc/ffi_linux64.c
index ef0361b..4d50878 100644
--- a/libffi/src/powerpc/ffi_linux64.c
+++ b/libffi/src/powerpc/ffi_linux64.c
@@ -38,7 +38,8 @@
/* About the LINUX64 ABI. */
enum {
NUM_GPR_ARG_REGISTERS64 = 8,
- NUM_FPR_ARG_REGISTERS64 = 13
+ NUM_FPR_ARG_REGISTERS64 = 13,
+ NUM_VEC_ARG_REGISTERS64 = 12,
};
enum { ASM_NEEDS_REGISTERS64 = 4 };
@@ -63,10 +64,31 @@ ffi_prep_types_linux64 (ffi_abi abi)
static unsigned int
-discover_homogeneous_aggregate (const ffi_type *t, unsigned int *elnum)
+discover_homogeneous_aggregate (ffi_abi abi,
+ const ffi_type *t,
+ unsigned int *elnum)
{
switch (t->type)
{
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+ case FFI_TYPE_LONGDOUBLE:
+ /* 64-bit long doubles are equivalent to doubles. */
+ if ((abi & FFI_LINUX_LONG_DOUBLE_128) == 0)
+ {
+ *elnum = 1;
+ return FFI_TYPE_DOUBLE;
+ }
+ /* IBM extended precision values use unaligned pairs
+ of FPRs, but according to the ABI must be considered
+ distinct from doubles. They are also limited to a
+ maximum of four members in a homogeneous aggregate. */
+ else if ((abi & FFI_LINUX_LONG_DOUBLE_IEEE128) == 0)
+ {
+ *elnum = 2;
+ return FFI_TYPE_LONGDOUBLE;
+ }
+ /* Fall through. */
+#endif
case FFI_TYPE_FLOAT:
case FFI_TYPE_DOUBLE:
*elnum = 1;
@@ -79,7 +101,7 @@ discover_homogeneous_aggregate (const ffi_type *t, unsigned int *elnum)
while (*el)
{
unsigned int el_elt, el_elnum = 0;
- el_elt = discover_homogeneous_aggregate (*el, &el_elnum);
+ el_elt = discover_homogeneous_aggregate (abi, *el, &el_elnum);
if (el_elt == 0
|| (base_elt && base_elt != el_elt))
return 0;
@@ -110,13 +132,23 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
{
ffi_type **ptr;
unsigned bytes;
- unsigned i, fparg_count = 0, intarg_count = 0;
+ unsigned i, fparg_count = 0, intarg_count = 0, vecarg_count = 0;
unsigned flags = cif->flags;
- unsigned int elt, elnum;
+ unsigned elt, elnum, rtype;
#if FFI_TYPE_LONGDOUBLE == FFI_TYPE_DOUBLE
- /* If compiled without long double support.. */
- if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
+ /* If compiled without long double support... */
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0 ||
+ (cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ return FFI_BAD_ABI;
+#elif !defined(__VEC__)
+ /* If compiled without vector register support (used by assembly)... */
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ return FFI_BAD_ABI;
+#else
+ /* If the IEEE128 flag is set, but long double is only 64 bits wide... */
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) == 0 &&
+ (cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
return FFI_BAD_ABI;
#endif
@@ -138,10 +170,19 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
#endif
/* Return value handling. */
- switch (cif->rtype->type)
+ rtype = cif->rtype->type;
+#if _CALL_ELF == 2
+homogeneous:
+#endif
+ switch (rtype)
{
#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
case FFI_TYPE_LONGDOUBLE:
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ flags |= FLAG_RETURNS_VEC;
+ break;
+ }
if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
flags |= FLAG_RETURNS_128BITS;
/* Fall through. */
@@ -164,19 +205,18 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
case FFI_TYPE_STRUCT:
#if _CALL_ELF == 2
- elt = discover_homogeneous_aggregate (cif->rtype, &elnum);
+ elt = discover_homogeneous_aggregate (cif->abi, cif->rtype, &elnum);
if (elt)
- {
- if (elt == FFI_TYPE_DOUBLE)
- flags |= FLAG_RETURNS_64BITS;
- flags |= FLAG_RETURNS_FP | FLAG_RETURNS_SMST;
- break;
- }
+ {
+ flags |= FLAG_RETURNS_SMST;
+ rtype = elt;
+ goto homogeneous;
+ }
if (cif->rtype->size <= 16)
- {
- flags |= FLAG_RETURNS_SMST;
- break;
- }
+ {
+ flags |= FLAG_RETURNS_SMST;
+ break;
+ }
#endif
intarg_count++;
flags |= FLAG_RETVAL_REFERENCE;
@@ -198,6 +238,15 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
{
#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
case FFI_TYPE_LONGDOUBLE:
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ vecarg_count++;
+ /* Align to 16 bytes, plus the 16-byte argument. */
+ intarg_count = (intarg_count + 3) & ~0x1;
+ if (vecarg_count > NUM_VEC_ARG_REGISTERS64)
+ flags |= FLAG_ARG_NEEDS_PSAVE;
+ break;
+ }
if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
{
fparg_count++;
@@ -221,10 +270,21 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
align = 16;
align = align / 8;
if (align > 1)
- intarg_count = ALIGN (intarg_count, align);
+ intarg_count = FFI_ALIGN (intarg_count, align);
}
intarg_count += ((*ptr)->size + 7) / 8;
- elt = discover_homogeneous_aggregate (*ptr, &elnum);
+ elt = discover_homogeneous_aggregate (cif->abi, *ptr, &elnum);
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+ if (elt == FFI_TYPE_LONGDOUBLE &&
+ (cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ vecarg_count += elnum;
+ if (vecarg_count > NUM_VEC_ARG_REGISTERS64)
+ flags |= FLAG_ARG_NEEDS_PSAVE;
+ break;
+ }
+ else
+#endif
if (elt)
{
fparg_count += elnum;
@@ -263,10 +323,17 @@ ffi_prep_cif_linux64_core (ffi_cif *cif)
flags |= FLAG_FP_ARGUMENTS;
if (intarg_count > 4)
flags |= FLAG_4_GPR_ARGUMENTS;
+ if (vecarg_count != 0)
+ flags |= FLAG_VEC_ARGUMENTS;
/* Space for the FPR registers, if needed. */
if (fparg_count != 0)
bytes += NUM_FPR_ARG_REGISTERS64 * sizeof (double);
+ /* Space for the vector registers, if needed, aligned to 16 bytes. */
+ if (vecarg_count != 0) {
+ bytes = (bytes + 15) & ~0xF;
+ bytes += NUM_VEC_ARG_REGISTERS64 * sizeof (float128);
+ }
/* Stack space. */
#if _CALL_ELF == 2
@@ -349,6 +416,8 @@ ffi_prep_cif_linux64_var (ffi_cif *cif,
|--------------------------------------------| |
| FPR registers f1-f13 (optional) 13*8 | |
|--------------------------------------------| |
+ | VEC registers v2-v13 (optional) 12*16 | |
+ |--------------------------------------------| |
| Parameter save area | |
|--------------------------------------------| |
| TOC save area 8 | |
@@ -378,6 +447,7 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
unsigned long *ul;
float *f;
double *d;
+ float128 *f128;
size_t p;
} valp;
@@ -391,11 +461,16 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
valp rest;
valp next_arg;
- /* 'fpr_base' points at the space for fpr3, and grows upwards as
+ /* 'fpr_base' points at the space for f1, and grows upwards as
we use FPR registers. */
valp fpr_base;
unsigned int fparg_count;
+ /* 'vec_base' points at the space for v2, and grows upwards as
+ we use vector registers. */
+ valp vec_base;
+ unsigned int vecarg_count;
+
unsigned int i, words, nargs, nfixedargs;
ffi_type **ptr;
double double_tmp;
@@ -412,6 +487,7 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
unsigned long **ul;
float **f;
double **d;
+ float128 **f128;
} p_argv;
unsigned long gprvalue;
unsigned long align;
@@ -426,11 +502,21 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
#endif
fpr_base.d = gpr_base.d - NUM_FPR_ARG_REGISTERS64;
fparg_count = 0;
+ /* Place the vector args below the FPRs, if used, else the GPRs. */
+ if (ecif->cif->flags & FLAG_FP_ARGUMENTS)
+ vec_base.p = fpr_base.p & ~0xF;
+ else
+ vec_base.p = gpr_base.p;
+ vec_base.f128 -= NUM_VEC_ARG_REGISTERS64;
+ vecarg_count = 0;
next_arg.ul = gpr_base.ul;
/* Check that everything starts aligned properly. */
FFI_ASSERT (((unsigned long) (char *) stack & 0xF) == 0);
FFI_ASSERT (((unsigned long) stacktop.c & 0xF) == 0);
+ FFI_ASSERT (((unsigned long) gpr_base.c & 0xF) == 0);
+ FFI_ASSERT (((unsigned long) gpr_end.c & 0xF) == 0);
+ FFI_ASSERT (((unsigned long) vec_base.c & 0xF) == 0);
FFI_ASSERT ((bytes & 0xF) == 0);
/* Deal with return values that are actually pass-by-reference. */
@@ -455,6 +541,22 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
{
#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
case FFI_TYPE_LONGDOUBLE:
+ if ((ecif->cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ next_arg.p = FFI_ALIGN (next_arg.p, 16);
+ if (next_arg.ul == gpr_end.ul)
+ next_arg.ul = rest.ul;
+ if (vecarg_count < NUM_VEC_ARG_REGISTERS64 && i < nfixedargs)
+ memcpy (vec_base.f128++, *p_argv.f128, sizeof (float128));
+ else
+ memcpy (next_arg.f128, *p_argv.f128, sizeof (float128));
+ if (++next_arg.f128 == gpr_end.f128)
+ next_arg.f128 = rest.f128;
+ vecarg_count++;
+ FFI_ASSERT (__LDBL_MANT_DIG__ == 113);
+ FFI_ASSERT (flags & FLAG_VEC_ARGUMENTS);
+ break;
+ }
if ((ecif->cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
{
double_tmp = (*p_argv.d)[0];
@@ -492,7 +594,9 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
/* Fall through. */
#endif
case FFI_TYPE_DOUBLE:
+#if _CALL_ELF != 2
do_double:
+#endif
double_tmp = **p_argv.d;
if (fparg_count < NUM_FPR_ARG_REGISTERS64 && i < nfixedargs)
{
@@ -511,7 +615,9 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
break;
case FFI_TYPE_FLOAT:
+#if _CALL_ELF != 2
do_float:
+#endif
double_tmp = **p_argv.f;
if (fparg_count < NUM_FPR_ARG_REGISTERS64 && i < nfixedargs)
{
@@ -548,9 +654,13 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
if (align > 16)
align = 16;
if (align > 1)
- next_arg.p = ALIGN (next_arg.p, align);
+ {
+ next_arg.p = FFI_ALIGN (next_arg.p, align);
+ if (next_arg.ul == gpr_end.ul)
+ next_arg.ul = rest.ul;
+ }
}
- elt = discover_homogeneous_aggregate (*ptr, &elnum);
+ elt = discover_homogeneous_aggregate (ecif->cif->abi, *ptr, &elnum);
if (elt)
{
#if _CALL_ELF == 2
@@ -558,9 +668,29 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
void *v;
float *f;
double *d;
+ float128 *f128;
} arg;
arg.v = *p_argv.v;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+ if (elt == FFI_TYPE_LONGDOUBLE &&
+ (ecif->cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ do
+ {
+ if (vecarg_count < NUM_VEC_ARG_REGISTERS64
+ && i < nfixedargs)
+ memcpy (vec_base.f128++, arg.f128, sizeof (float128));
+ else
+ memcpy (next_arg.f128, arg.f128++, sizeof (float128));
+ if (++next_arg.f128 == gpr_end.f128)
+ next_arg.f128 = rest.f128;
+ vecarg_count++;
+ }
+ while (--elnum != 0);
+ }
+ else
+#endif
if (elt == FFI_TYPE_FLOAT)
{
do
@@ -576,11 +706,9 @@ ffi_prep_args64 (extended_cif *ecif, unsigned long *const stack)
fparg_count++;
}
while (--elnum != 0);
- if ((next_arg.p & 3) != 0)
- {
- if (++next_arg.f == gpr_end.f)
- next_arg.f = rest.f;
- }
+ if ((next_arg.p & 7) != 0)
+ if (++next_arg.f == gpr_end.f)
+ next_arg.f = rest.f;
}
else
do
@@ -733,17 +861,20 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
void *user_data,
void *rvalue,
unsigned long *pst,
- ffi_dblfl *pfr)
+ ffi_dblfl *pfr,
+ float128 *pvec)
{
/* rvalue is the pointer to space for return value in closure assembly */
/* pst is the pointer to parameter save area
(r3-r10 are stored into its first 8 slots by ffi_closure_LINUX64) */
/* pfr is the pointer to where f1-f13 are stored in ffi_closure_LINUX64 */
+ /* pvec is the pointer to where v2-v13 are stored in ffi_closure_LINUX64 */
void **avalue;
ffi_type **arg_types;
unsigned long i, avn, nfixedargs;
ffi_dblfl *end_pfr = pfr + NUM_FPR_ARG_REGISTERS64;
+ float128 *end_pvec = pvec + NUM_VEC_ARG_REGISTERS64;
unsigned long align;
avalue = alloca (cif->nargs * sizeof (void *));
@@ -811,9 +942,9 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
if (align > 16)
align = 16;
if (align > 1)
- pst = (unsigned long *) ALIGN ((size_t) pst, align);
+ pst = (unsigned long *) FFI_ALIGN ((size_t) pst, align);
}
- elt = discover_homogeneous_aggregate (arg_types[i], &elnum);
+ elt = discover_homogeneous_aggregate (cif->abi, arg_types[i], &elnum);
if (elt)
{
#if _CALL_ELF == 2
@@ -822,6 +953,7 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
unsigned long *ul;
float *f;
double *d;
+ float128 *f128;
size_t p;
} to, from;
@@ -829,6 +961,17 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
aggregate size is not greater than the space taken by
the registers so store back to the register/parameter
save arrays. */
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+ if (elt == FFI_TYPE_LONGDOUBLE &&
+ (cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ if (pvec + elnum <= end_pvec)
+ to.v = pvec;
+ else
+ to.v = pst;
+ }
+ else
+#endif
if (pfr + elnum <= end_pfr)
to.v = pfr;
else
@@ -836,6 +979,23 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
avalue[i] = to.v;
from.ul = pst;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+ if (elt == FFI_TYPE_LONGDOUBLE &&
+ (cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ do
+ {
+ if (pvec < end_pvec && i < nfixedargs)
+ memcpy (to.f128, pvec++, sizeof (float128));
+ else
+ memcpy (to.f128, from.f128, sizeof (float128));
+ to.f128++;
+ from.f128++;
+ }
+ while (--elnum != 0);
+ }
+ else
+#endif
if (elt == FFI_TYPE_FLOAT)
{
do
@@ -891,7 +1051,18 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
case FFI_TYPE_LONGDOUBLE:
- if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
+ if ((cif->abi & FFI_LINUX_LONG_DOUBLE_IEEE128) != 0)
+ {
+ if (((unsigned long) pst & 0xF) != 0)
+ ++pst;
+ if (pvec < end_pvec && i < nfixedargs)
+ avalue[i] = pvec++;
+ else
+ avalue[i] = pst;
+ pst += 2;
+ break;
+ }
+ else if ((cif->abi & FFI_LINUX_LONG_DOUBLE_128) != 0)
{
if (pfr + 1 < end_pfr && i + 1 < nfixedargs)
{
@@ -915,7 +1086,9 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
/* Fall through. */
#endif
case FFI_TYPE_DOUBLE:
+#if _CALL_ELF != 2
do_double:
+#endif
/* On the outgoing stack all values are aligned to 8 */
/* there are 13 64bit floating point registers */
@@ -930,7 +1103,9 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
break;
case FFI_TYPE_FLOAT:
+#if _CALL_ELF != 2
do_float:
+#endif
if (pfr < end_pfr && i < nfixedargs)
{
/* Float values are stored as doubles in the
@@ -962,13 +1137,17 @@ ffi_closure_helper_LINUX64 (ffi_cif *cif,
/* Tell ffi_closure_LINUX64 how to perform return type promotions. */
if ((cif->flags & FLAG_RETURNS_SMST) != 0)
{
- if ((cif->flags & FLAG_RETURNS_FP) == 0)
+ if ((cif->flags & (FLAG_RETURNS_FP | FLAG_RETURNS_VEC)) == 0)
return FFI_V2_TYPE_SMALL_STRUCT + cif->rtype->size - 1;
+ else if ((cif->flags & FLAG_RETURNS_VEC) != 0)
+ return FFI_V2_TYPE_VECTOR_HOMOG;
else if ((cif->flags & FLAG_RETURNS_64BITS) != 0)
return FFI_V2_TYPE_DOUBLE_HOMOG;
else
return FFI_V2_TYPE_FLOAT_HOMOG;
}
+ if ((cif->flags & FLAG_RETURNS_VEC) != 0)
+ return FFI_V2_TYPE_VECTOR;
return cif->rtype->type;
}
#endif
diff --git a/libffi/src/powerpc/ffi_powerpc.h b/libffi/src/powerpc/ffi_powerpc.h
index 3dcd6b5..960a5c4 100644
--- a/libffi/src/powerpc/ffi_powerpc.h
+++ b/libffi/src/powerpc/ffi_powerpc.h
@@ -31,22 +31,24 @@
enum {
/* The assembly depends on these exact flags. */
/* These go in cr7 */
- FLAG_RETURNS_SMST = 1 << (31-31), /* Used for FFI_SYSV small structs. */
+ FLAG_RETURNS_SMST = 1 << (31-31), /* Used for FFI_SYSV small structs. */
FLAG_RETURNS_NOTHING = 1 << (31-30),
FLAG_RETURNS_FP = 1 << (31-29),
- FLAG_RETURNS_64BITS = 1 << (31-28),
+ FLAG_RETURNS_VEC = 1 << (31-28),
- /* This goes in cr6 */
- FLAG_RETURNS_128BITS = 1 << (31-27),
+ /* These go in cr6 */
+ FLAG_RETURNS_64BITS = 1 << (31-27),
+ FLAG_RETURNS_128BITS = 1 << (31-26),
- FLAG_COMPAT = 1 << (31- 8), /* Not used by assembly */
+ FLAG_COMPAT = 1 << (31- 8), /* Not used by assembly */
/* These go in cr1 */
FLAG_ARG_NEEDS_COPY = 1 << (31- 7), /* Used by sysv code */
FLAG_ARG_NEEDS_PSAVE = FLAG_ARG_NEEDS_COPY, /* Used by linux64 code */
FLAG_FP_ARGUMENTS = 1 << (31- 6), /* cr1.eq; specified by ABI */
FLAG_4_GPR_ARGUMENTS = 1 << (31- 5),
- FLAG_RETVAL_REFERENCE = 1 << (31- 4)
+ FLAG_RETVAL_REFERENCE = 1 << (31- 4),
+ FLAG_VEC_ARGUMENTS = 1 << (31- 3),
};
typedef union
@@ -55,6 +57,14 @@ typedef union
double d;
} ffi_dblfl;
+#if defined(__FLOAT128_TYPE__) && defined(__HAVE_FLOAT128)
+typedef _Float128 float128;
+#elif defined(__FLOAT128__)
+typedef __float128 float128;
+#else
+typedef char float128[16] __attribute__((aligned(16)));
+#endif
+
void FFI_HIDDEN ffi_closure_SYSV (void);
void FFI_HIDDEN ffi_go_closure_sysv (void);
void FFI_HIDDEN ffi_call_SYSV(extended_cif *, void (*)(void), void *,
@@ -91,4 +101,5 @@ int FFI_HIDDEN ffi_closure_helper_LINUX64 (ffi_cif *,
void (*) (ffi_cif *, void *,
void **, void *),
void *, void *,
- unsigned long *, ffi_dblfl *);
+ unsigned long *, ffi_dblfl *,
+ float128 *);
diff --git a/libffi/src/powerpc/ffitarget.h b/libffi/src/powerpc/ffitarget.h
index 90aa36b..7fb9a93 100644
--- a/libffi/src/powerpc/ffitarget.h
+++ b/libffi/src/powerpc/ffitarget.h
@@ -91,15 +91,19 @@ typedef enum ffi_abi {
/* This and following bits can reuse FFI_COMPAT values. */
FFI_LINUX_STRUCT_ALIGN = 1,
FFI_LINUX_LONG_DOUBLE_128 = 2,
+ FFI_LINUX_LONG_DOUBLE_IEEE128 = 4,
FFI_DEFAULT_ABI = (FFI_LINUX
# ifdef __STRUCT_PARM_ALIGN__
| FFI_LINUX_STRUCT_ALIGN
# endif
# ifdef __LONG_DOUBLE_128__
| FFI_LINUX_LONG_DOUBLE_128
+# ifdef __LONG_DOUBLE_IEEE128__
+ | FFI_LINUX_LONG_DOUBLE_IEEE128
+# endif
# endif
),
- FFI_LAST_ABI = 12
+ FFI_LAST_ABI = 16
# else
/* This bit, always set in new code, must not be set in any of the
@@ -167,9 +171,11 @@ typedef enum ffi_abi {
#define FFI_SYSV_TYPE_SMALL_STRUCT (FFI_PPC_TYPE_LAST + 2)
/* Used by ELFv2 for homogenous structure returns. */
-#define FFI_V2_TYPE_FLOAT_HOMOG (FFI_PPC_TYPE_LAST + 1)
-#define FFI_V2_TYPE_DOUBLE_HOMOG (FFI_PPC_TYPE_LAST + 2)
-#define FFI_V2_TYPE_SMALL_STRUCT (FFI_PPC_TYPE_LAST + 3)
+#define FFI_V2_TYPE_VECTOR (FFI_PPC_TYPE_LAST + 1)
+#define FFI_V2_TYPE_VECTOR_HOMOG (FFI_PPC_TYPE_LAST + 2)
+#define FFI_V2_TYPE_FLOAT_HOMOG (FFI_PPC_TYPE_LAST + 3)
+#define FFI_V2_TYPE_DOUBLE_HOMOG (FFI_PPC_TYPE_LAST + 4)
+#define FFI_V2_TYPE_SMALL_STRUCT (FFI_PPC_TYPE_LAST + 5)
#if _CALL_ELF == 2
# define FFI_TRAMPOLINE_SIZE 32
diff --git a/libffi/src/powerpc/linux64.S b/libffi/src/powerpc/linux64.S
index f0006fe..e92d64a 100644
--- a/libffi/src/powerpc/linux64.S
+++ b/libffi/src/powerpc/linux64.S
@@ -109,40 +109,70 @@ ffi_call_LINUX64:
ld %r2, 8(%r29)
# endif
/* Now do the call. */
- /* Set up cr1 with bits 4-7 of the flags. */
- mtcrf 0x40, %r31
+ /* Set up cr1 with bits 3-7 of the flags. */
+ mtcrf 0xc0, %r31
/* Get the address to call into CTR. */
mtctr %r12
/* Load all those argument registers. */
- ld %r3, -32-(8*8)(%r28)
- ld %r4, -32-(7*8)(%r28)
- ld %r5, -32-(6*8)(%r28)
- ld %r6, -32-(5*8)(%r28)
+ addi %r29, %r28, -32-(8*8)
+ ld %r3, (0*8)(%r29)
+ ld %r4, (1*8)(%r29)
+ ld %r5, (2*8)(%r29)
+ ld %r6, (3*8)(%r29)
bf- 5, 1f
- ld %r7, -32-(4*8)(%r28)
- ld %r8, -32-(3*8)(%r28)
- ld %r9, -32-(2*8)(%r28)
- ld %r10, -32-(1*8)(%r28)
+ ld %r7, (4*8)(%r29)
+ ld %r8, (5*8)(%r29)
+ ld %r9, (6*8)(%r29)
+ ld %r10, (7*8)(%r29)
1:
/* Load all the FP registers. */
bf- 6, 2f
- lfd %f1, -32-(21*8)(%r28)
- lfd %f2, -32-(20*8)(%r28)
- lfd %f3, -32-(19*8)(%r28)
- lfd %f4, -32-(18*8)(%r28)
- lfd %f5, -32-(17*8)(%r28)
- lfd %f6, -32-(16*8)(%r28)
- lfd %f7, -32-(15*8)(%r28)
- lfd %f8, -32-(14*8)(%r28)
- lfd %f9, -32-(13*8)(%r28)
- lfd %f10, -32-(12*8)(%r28)
- lfd %f11, -32-(11*8)(%r28)
- lfd %f12, -32-(10*8)(%r28)
- lfd %f13, -32-(9*8)(%r28)
+ addi %r29, %r29, -(14*8)
+ lfd %f1, ( 1*8)(%r29)
+ lfd %f2, ( 2*8)(%r29)
+ lfd %f3, ( 3*8)(%r29)
+ lfd %f4, ( 4*8)(%r29)
+ lfd %f5, ( 5*8)(%r29)
+ lfd %f6, ( 6*8)(%r29)
+ lfd %f7, ( 7*8)(%r29)
+ lfd %f8, ( 8*8)(%r29)
+ lfd %f9, ( 9*8)(%r29)
+ lfd %f10, (10*8)(%r29)
+ lfd %f11, (11*8)(%r29)
+ lfd %f12, (12*8)(%r29)
+ lfd %f13, (13*8)(%r29)
2:
+ /* Load all the vector registers. */
+ bf- 3, 3f
+ addi %r29, %r29, -16
+ lvx %v13, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v12, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v11, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v10, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v9, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v8, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v7, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v6, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v5, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v4, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v3, 0, %r29
+ addi %r29, %r29, -16
+ lvx %v2, 0, %r29
+3:
+
/* Make the call. */
ld %r11, 8(%r28)
bctrl
@@ -160,6 +190,7 @@ ffi_call_LINUX64:
bt 31, .Lstruct_return_value
bt 30, .Ldone_return_value
bt 29, .Lfp_return_value
+ bt 28, .Lvec_return_value
std %r3, 0(%r30)
/* Fall through... */
@@ -175,12 +206,16 @@ ffi_call_LINUX64:
ld %r31, -8(%r1)
blr
+.Lvec_return_value:
+ stvx %v2, 0, %r30
+ b .Ldone_return_value
+
.Lfp_return_value:
.cfi_def_cfa_register 28
- bf 28, .Lfloat_return_value
- stfd %f1, 0(%r30)
mtcrf 0x02, %r31 /* cr6 */
- bf 27, .Ldone_return_value
+ bf 27, .Lfloat_return_value
+ stfd %f1, 0(%r30)
+ bf 26, .Ldone_return_value
stfd %f2, 8(%r30)
b .Ldone_return_value
.Lfloat_return_value:
@@ -188,8 +223,9 @@ ffi_call_LINUX64:
b .Ldone_return_value
.Lstruct_return_value:
- bf 29, .Lsmall_struct
- bf 28, .Lfloat_homog_return_value
+ bf 29, .Lvec_homog_or_small_struct
+ mtcrf 0x02, %r31 /* cr6 */
+ bf 27, .Lfloat_homog_return_value
stfd %f1, 0(%r30)
stfd %f2, 8(%r30)
stfd %f3, 16(%r30)
@@ -211,6 +247,25 @@ ffi_call_LINUX64:
stfs %f8, 28(%r30)
b .Ldone_return_value
+.Lvec_homog_or_small_struct:
+ bf 28, .Lsmall_struct
+ stvx %v2, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v3, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v4, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v5, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v6, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v7, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v8, 0, %r30
+ addi %r30, %r30, 16
+ stvx %v9, 0, %r30
+ b .Ldone_return_value
+
.Lsmall_struct:
std %r3, 0(%r30)
std %r4, 8(%r30)
diff --git a/libffi/src/powerpc/linux64_closure.S b/libffi/src/powerpc/linux64_closure.S
index 5663bb4..3469a2c 100644
--- a/libffi/src/powerpc/linux64_closure.S
+++ b/libffi/src/powerpc/linux64_closure.S
@@ -63,9 +63,15 @@ ffi_closure_LINUX64:
# endif
# if _CALL_ELF == 2
-# 32 byte special reg save area + 64 byte parm save area
-# + 64 byte retval area + 13*8 fpr save area + round to 16
-# define STACKFRAME 272
+# ifdef __VEC__
+# 32 byte special reg save area + 64 byte parm save area
+# + 128 byte retval area + 13*8 fpr save area + 12*16 vec save area + round to 16
+# define STACKFRAME 528
+# else
+# 32 byte special reg save area + 64 byte parm save area
+# + 64 byte retval area + 13*8 fpr save area + round to 16
+# define STACKFRAME 272
+# endif
# define PARMSAVE 32
# define RETVAL PARMSAVE+64
# else
@@ -148,6 +154,35 @@ ffi_closure_LINUX64:
# load up the pointer to the saved fpr registers
addi %r8, %r1, -104
+# ifdef __VEC__
+ # load up the pointer to the saved vector registers
+ # 8 bytes padding for 16-byte alignment at -112(%r1)
+ addi %r9, %r8, -24
+ stvx %v13, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v12, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v11, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v10, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v9, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v8, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v7, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v6, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v5, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v4, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v3, 0, %r9
+ addi %r9, %r9, -16
+ stvx %v2, 0, %r9
+# endif
+
# load up the pointer to the result storage
addi %r6, %r1, -STACKFRAME+RETVAL
@@ -323,6 +358,16 @@ ffi_closure_LINUX64:
.cfi_def_cfa_offset 0
blr
.cfi_def_cfa_offset STACKFRAME
+# case FFI_V2_TYPE_VECTOR
+ addi %r3, %r1, RETVAL
+ lvx %v2, 0, %r3
+ mtlr %r0
+ b .Lfinish
+# case FFI_V2_TYPE_VECTOR_HOMOG
+ addi %r3, %r1, RETVAL
+ lvx %v2, 0, %r3
+ addi %r3, %r3, 16
+ b .Lmorevector
# case FFI_V2_TYPE_FLOAT_HOMOG
lfs %f1, RETVAL+0(%r1)
lfs %f2, RETVAL+4(%r1)
@@ -342,6 +387,25 @@ ffi_closure_LINUX64:
.cfi_def_cfa_offset 0
blr
.cfi_def_cfa_offset STACKFRAME
+.Lmorevector:
+ lvx %v3, 0, %r3
+ addi %r3, %r3, 16
+ lvx %v4, 0, %r3
+ addi %r3, %r3, 16
+ lvx %v5, 0, %r3
+ mtlr %r0
+ addi %r3, %r3, 16
+ lvx %v6, 0, %r3
+ addi %r3, %r3, 16
+ lvx %v7, 0, %r3
+ addi %r3, %r3, 16
+ lvx %v8, 0, %r3
+ addi %r3, %r3, 16
+ lvx %v9, 0, %r3
+ addi %r1, %r1, STACKFRAME
+ .cfi_def_cfa_offset 0
+ blr
+ .cfi_def_cfa_offset STACKFRAME
.Lmorefloat:
lfs %f4, RETVAL+12(%r1)
mtlr %r0
diff --git a/libffi/src/powerpc/sysv.S b/libffi/src/powerpc/sysv.S
index 1474ce7..df97734 100644
--- a/libffi/src/powerpc/sysv.S
+++ b/libffi/src/powerpc/sysv.S
@@ -104,17 +104,16 @@ ENTRY(ffi_call_SYSV)
bctrl
/* Now, deal with the return value. */
- mtcrf 0x01,%r31 /* cr7 */
+ mtcrf 0x03,%r31 /* cr6-cr7 */
bt- 31,L(small_struct_return_value)
bt- 30,L(done_return_value)
#ifndef __NO_FPRS__
bt- 29,L(fp_return_value)
#endif
stw %r3,0(%r30)
- bf+ 28,L(done_return_value)
+ bf+ 27,L(done_return_value)
stw %r4,4(%r30)
- mtcrf 0x02,%r31 /* cr6 */
- bf 27,L(done_return_value)
+ bf 26,L(done_return_value)
stw %r5,8(%r30)
stw %r6,12(%r30)
/* Fall through... */
@@ -145,10 +144,9 @@ L(done_return_value):
#ifndef __NO_FPRS__
L(fp_return_value):
.cfi_restore_state
- bf 28,L(float_return_value)
+ bf 27,L(float_return_value)
stfd %f1,0(%r30)
- mtcrf 0x02,%r31 /* cr6 */
- bf 27,L(done_return_value)
+ bf 26,L(done_return_value)
stfd %f2,8(%r30)
b L(done_return_value)
L(float_return_value):
diff --git a/libffi/src/prep_cif.c b/libffi/src/prep_cif.c
index 5881ceb..c1832b1 100644
--- a/libffi/src/prep_cif.c
+++ b/libffi/src/prep_cif.c
@@ -1,5 +1,5 @@
/* -----------------------------------------------------------------------
- prep_cif.c - Copyright (c) 2011, 2012 Anthony Green
+ prep_cif.c - Copyright (c) 2011, 2012, 2021 Anthony Green
Copyright (c) 1996, 1998, 2007 Red Hat, Inc.
Permission is hereby granted, free of charge, to any person obtaining
@@ -29,12 +29,12 @@
/* Round up to FFI_SIZEOF_ARG. */
-#define STACK_ARG_SIZE(x) ALIGN(x, FFI_SIZEOF_ARG)
+#define STACK_ARG_SIZE(x) FFI_ALIGN(x, FFI_SIZEOF_ARG)
/* Perform machine independent initialization of aggregate type
specifications. */
-static ffi_status initialize_aggregate(ffi_type *arg)
+static ffi_status initialize_aggregate(ffi_type *arg, size_t *offsets)
{
ffi_type **ptr;
@@ -52,13 +52,15 @@ static ffi_status initialize_aggregate(ffi_type *arg)
while ((*ptr) != NULL)
{
if (UNLIKELY(((*ptr)->size == 0)
- && (initialize_aggregate((*ptr)) != FFI_OK)))
+ && (initialize_aggregate((*ptr), NULL) != FFI_OK)))
return FFI_BAD_TYPEDEF;
/* Perform a sanity check on the argument type */
FFI_ASSERT_VALID_TYPE(*ptr);
- arg->size = ALIGN(arg->size, (*ptr)->alignment);
+ arg->size = FFI_ALIGN(arg->size, (*ptr)->alignment);
+ if (offsets)
+ *offsets++ = arg->size;
arg->size += (*ptr)->size;
arg->alignment = (arg->alignment > (*ptr)->alignment) ?
@@ -74,7 +76,7 @@ static ffi_status initialize_aggregate(ffi_type *arg)
struct A { long a; char b; }; struct B { struct A x; char y; };
should find y at an offset of 2*sizeof(long) and result in a
total size of 3*sizeof(long). */
- arg->size = ALIGN (arg->size, arg->alignment);
+ arg->size = FFI_ALIGN (arg->size, arg->alignment);
/* On some targets, the ABI defines that structures have an additional
alignment beyond the "natural" one based on their elements. */
@@ -127,13 +129,16 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
cif->rtype = rtype;
cif->flags = 0;
-
+#if (defined(_M_ARM64) || defined(__aarch64__)) && defined(_WIN32)
+ cif->is_variadic = isvariadic;
+#endif
#if HAVE_LONG_DOUBLE_VARIANT
ffi_prep_types (abi);
#endif
/* Initialize the return type if necessary */
- if ((cif->rtype->size == 0) && (initialize_aggregate(cif->rtype) != FFI_OK))
+ if ((cif->rtype->size == 0)
+ && (initialize_aggregate(cif->rtype, NULL) != FFI_OK))
return FFI_BAD_TYPEDEF;
#ifndef FFI_TARGET_HAS_COMPLEX_TYPE
@@ -164,7 +169,8 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
{
/* Initialize any uninitialized aggregate type definitions */
- if (((*ptr)->size == 0) && (initialize_aggregate((*ptr)) != FFI_OK))
+ if (((*ptr)->size == 0)
+ && (initialize_aggregate((*ptr), NULL) != FFI_OK))
return FFI_BAD_TYPEDEF;
#ifndef FFI_TARGET_HAS_COMPLEX_TYPE
@@ -179,7 +185,7 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
{
/* Add any padding if necessary */
if (((*ptr)->alignment - 1) & bytes)
- bytes = (unsigned)ALIGN(bytes, (*ptr)->alignment);
+ bytes = (unsigned)FFI_ALIGN(bytes, (*ptr)->alignment);
#ifdef TILE
if (bytes < 10 * FFI_SIZEOF_ARG &&
@@ -195,7 +201,7 @@ ffi_status FFI_HIDDEN ffi_prep_cif_core(ffi_cif *cif, ffi_abi abi,
bytes = 6*4;
#endif
- bytes += STACK_ARG_SIZE((*ptr)->size);
+ bytes += (unsigned int)STACK_ARG_SIZE((*ptr)->size);
}
#endif
}
@@ -225,7 +231,26 @@ ffi_status ffi_prep_cif_var(ffi_cif *cif,
ffi_type *rtype,
ffi_type **atypes)
{
- return ffi_prep_cif_core(cif, abi, 1, nfixedargs, ntotalargs, rtype, atypes);
+ ffi_status rc;
+ size_t int_size = ffi_type_sint.size;
+ int i;
+
+ rc = ffi_prep_cif_core(cif, abi, 1, nfixedargs, ntotalargs, rtype, atypes);
+
+ if (rc != FFI_OK)
+ return rc;
+
+ for (i = 1; i < ntotalargs; i++)
+ {
+ ffi_type *arg_type = atypes[i];
+ if (arg_type == &ffi_type_float
+ || ((arg_type->type != FFI_TYPE_STRUCT
+ && arg_type->type != FFI_TYPE_COMPLEX)
+ && arg_type->size < int_size))
+ return FFI_BAD_ARGTYPE;
+ }
+
+ return FFI_OK;
}
#if FFI_CLOSURES
@@ -240,3 +265,18 @@ ffi_prep_closure (ffi_closure* closure,
}
#endif
+
+ffi_status
+ffi_get_struct_offsets (ffi_abi abi, ffi_type *struct_type, size_t *offsets)
+{
+ if (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI))
+ return FFI_BAD_ABI;
+ if (struct_type->type != FFI_TYPE_STRUCT)
+ return FFI_BAD_TYPEDEF;
+
+#if HAVE_LONG_DOUBLE_VARIANT
+ ffi_prep_types (abi);
+#endif
+
+ return initialize_aggregate(struct_type, offsets);
+}
diff --git a/libffi/src/raw_api.c b/libffi/src/raw_api.c
index 276cb22..be15611 100644
--- a/libffi/src/raw_api.c
+++ b/libffi/src/raw_api.c
@@ -43,10 +43,10 @@ ffi_raw_size (ffi_cif *cif)
{
#if !FFI_NO_STRUCTS
if ((*at)->type == FFI_TYPE_STRUCT)
- result += ALIGN (sizeof (void*), FFI_SIZEOF_ARG);
+ result += FFI_ALIGN (sizeof (void*), FFI_SIZEOF_ARG);
else
#endif
- result += ALIGN ((*at)->size, FFI_SIZEOF_ARG);
+ result += FFI_ALIGN ((*at)->size, FFI_SIZEOF_ARG);
}
return result;
@@ -98,7 +98,7 @@ ffi_raw_to_ptrarray (ffi_cif *cif, ffi_raw *raw, void **args)
default:
*args = raw;
- raw += ALIGN ((*tp)->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ raw += FFI_ALIGN ((*tp)->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
}
}
@@ -123,7 +123,7 @@ ffi_raw_to_ptrarray (ffi_cif *cif, ffi_raw *raw, void **args)
else
{
*args = (void*) raw;
- raw += ALIGN ((*tp)->size, sizeof (void*)) / sizeof (void*);
+ raw += FFI_ALIGN ((*tp)->size, sizeof (void*)) / sizeof (void*);
}
}
@@ -186,7 +186,7 @@ ffi_ptrarray_to_raw (ffi_cif *cif, void **args, ffi_raw *raw)
default:
memcpy ((void*) raw->data, (void*)*args, (*tp)->size);
- raw += ALIGN ((*tp)->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
+ raw += FFI_ALIGN ((*tp)->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
}
}
}
diff --git a/libffi/src/riscv/ffi.c b/libffi/src/riscv/ffi.c
index 8c5a860..c910858 100644
--- a/libffi/src/riscv/ffi.c
+++ b/libffi/src/riscv/ffi.c
@@ -120,7 +120,7 @@ static float_struct_info struct_passed_as_elements(call_builder *cb, ffi_type *t
ret.type1 = fields[0]->type;
ret.type2 = fields[1]->type;
- ret.offset2 = ALIGN(fields[0]->size, fields[1]->alignment);
+ ret.offset2 = FFI_ALIGN(fields[0]->size, fields[1]->alignment);
ret.as_elements = 1;
}
@@ -238,8 +238,8 @@ static void marshal(call_builder *cb, ffi_type *type, int var, void *data) {
/* variadics are aligned even in registers */
if (type->alignment > __SIZEOF_POINTER__) {
if (var)
- cb->used_integer = ALIGN(cb->used_integer, 2);
- cb->used_stack = (size_t *)ALIGN(cb->used_stack, 2*__SIZEOF_POINTER__);
+ cb->used_integer = FFI_ALIGN(cb->used_integer, 2);
+ cb->used_stack = (size_t *)FFI_ALIGN(cb->used_stack, 2*__SIZEOF_POINTER__);
}
memcpy(realign, data, type->size);
@@ -286,8 +286,8 @@ static void *unmarshal(call_builder *cb, ffi_type *type, int var, void *data) {
/* variadics are aligned even in registers */
if (type->alignment > __SIZEOF_POINTER__) {
if (var)
- cb->used_integer = ALIGN(cb->used_integer, 2);
- cb->used_stack = (size_t *)ALIGN(cb->used_stack, 2*__SIZEOF_POINTER__);
+ cb->used_integer = FFI_ALIGN(cb->used_integer, 2);
+ cb->used_stack = (size_t *)FFI_ALIGN(cb->used_stack, 2*__SIZEOF_POINTER__);
}
if (type->size > 0)
@@ -334,10 +334,10 @@ ffi_call_int (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue,
/* this is a conservative estimate, assuming a complex return value and
that all remaining arguments are long long / __int128 */
size_t arg_bytes = cif->nargs <= 3 ? 0 :
- ALIGN(2 * sizeof(size_t) * (cif->nargs - 3), STKALIGN);
+ FFI_ALIGN(2 * sizeof(size_t) * (cif->nargs - 3), STKALIGN);
size_t rval_bytes = 0;
if (rvalue == NULL && cif->rtype->size > 2*__SIZEOF_POINTER__)
- rval_bytes = ALIGN(cif->rtype->size, STKALIGN);
+ rval_bytes = FFI_ALIGN(cif->rtype->size, STKALIGN);
size_t alloc_size = arg_bytes + rval_bytes + sizeof(call_context);
/* the assembly code will deallocate all stack data at lower addresses
@@ -350,7 +350,7 @@ ffi_call_int (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue,
guarantee alloca alignment to at least that much */
alloc_base = (size_t)alloca(alloc_size);
} else {
- alloc_base = ALIGN(alloca(alloc_size + STKALIGN - 1), STKALIGN);
+ alloc_base = FFI_ALIGN(alloca(alloc_size + STKALIGN - 1), STKALIGN);
}
if (rval_bytes)
diff --git a/libffi/src/sparc/ffi.c b/libffi/src/sparc/ffi.c
index d5212d8..9e406d0 100644
--- a/libffi/src/sparc/ffi.c
+++ b/libffi/src/sparc/ffi.c
@@ -153,7 +153,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
/* FALLTHRU */
default:
- z = ALIGN(z, 4);
+ z = FFI_ALIGN(z, 4);
}
bytes += z;
}
@@ -167,7 +167,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
bytes += 4;
/* The stack must be 2 word aligned, so round bytes up appropriately. */
- bytes = ALIGN(bytes, 2 * 4);
+ bytes = FFI_ALIGN(bytes, 2 * 4);
/* Include the call frame to prep_args. */
bytes += 4*16 + 4*8;
@@ -293,7 +293,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
got to pass the return value to the callee. Otherwise ignore it. */
if (rvalue == NULL
&& (cif->flags & SPARC_FLAG_RET_MASK) == SPARC_RET_STRUCT)
- bytes += ALIGN (cif->rtype->size, 8);
+ bytes += FFI_ALIGN (cif->rtype->size, 8);
ffi_call_v8(cif, fn, rvalue, avalue, -bytes, closure);
}
diff --git a/libffi/src/sparc/ffi64.c b/libffi/src/sparc/ffi64.c
index 340b198..9e04061 100644
--- a/libffi/src/sparc/ffi64.c
+++ b/libffi/src/sparc/ffi64.c
@@ -75,7 +75,7 @@ ffi_struct_float_mask (ffi_type *outer_type, int size_mask)
size_t z = t->size;
int o, m, tt;
- size_mask = ALIGN(size_mask, t->alignment);
+ size_mask = FFI_ALIGN(size_mask, t->alignment);
switch (t->type)
{
case FFI_TYPE_STRUCT:
@@ -99,7 +99,7 @@ ffi_struct_float_mask (ffi_type *outer_type, int size_mask)
size_mask += z;
}
- size_mask = ALIGN(size_mask, outer_type->alignment);
+ size_mask = FFI_ALIGN(size_mask, outer_type->alignment);
FFI_ASSERT ((size_mask & 0xff) == outer_type->size);
return size_mask;
@@ -284,8 +284,8 @@ ffi_prep_cif_machdep_core(ffi_cif *cif)
flags |= SPARC_FLAG_FP_ARGS;
break;
}
- bytes = ALIGN(bytes, a);
- bytes += ALIGN(z, 8);
+ bytes = FFI_ALIGN(bytes, a);
+ bytes += FFI_ALIGN(z, 8);
}
/* Sparc call frames require that space is allocated for 6 args,
@@ -294,7 +294,7 @@ ffi_prep_cif_machdep_core(ffi_cif *cif)
bytes = 6 * 8;
/* The stack must be 2 word aligned, so round bytes up appropriately. */
- bytes = ALIGN(bytes, 16);
+ bytes = FFI_ALIGN(bytes, 16);
/* Include the call frame to prep_args. */
bytes += 8*16 + 8*8;
@@ -405,7 +405,7 @@ ffi_prep_args_v9(ffi_cif *cif, unsigned long *argp, void *rvalue, void **avalue)
if (((unsigned long)argp & 15) && ty->alignment > 8)
argp++;
memcpy(argp, a, z);
- argp += ALIGN(z, 8) / 8;
+ argp += FFI_ALIGN(z, 8) / 8;
break;
default:
@@ -425,7 +425,7 @@ ffi_call_int(ffi_cif *cif, void (*fn)(void), void *rvalue,
FFI_ASSERT (cif->abi == FFI_V9);
if (rvalue == NULL && (cif->flags & SPARC_FLAG_RET_IN_MEM))
- bytes += ALIGN (cif->rtype->size, 16);
+ bytes += FFI_ALIGN (cif->rtype->size, 16);
ffi_call_v9(cif, fn, rvalue, avalue, -bytes, closure);
}
@@ -547,7 +547,7 @@ ffi_closure_sparc_inner_v9(ffi_cif *cif,
a = *(void **)a;
else
{
- argx = argn + ALIGN (z, 8) / 8;
+ argx = argn + FFI_ALIGN (z, 8) / 8;
if (named && argn < 16)
{
int size_mask = ffi_struct_float_mask (ty, 0);
@@ -561,7 +561,7 @@ ffi_closure_sparc_inner_v9(ffi_cif *cif,
break;
case FFI_TYPE_LONGDOUBLE:
- argn = ALIGN (argn, 2);
+ argn = FFI_ALIGN (argn, 2);
a = (named && argn < 16 ? fpr : gpr) + argn;
argx = argn + 2;
break;
diff --git a/libffi/src/tramp.c b/libffi/src/tramp.c
new file mode 100644
index 0000000..265aeaa
--- /dev/null
+++ b/libffi/src/tramp.c
@@ -0,0 +1,729 @@
+/* -----------------------------------------------------------------------
+ tramp.c - Copyright (c) 2020 Madhavan T. Venkataraman
+
+ API and support functions for managing statically defined closure
+ trampolines.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#include <fficonfig.h>
+
+#ifdef FFI_EXEC_STATIC_TRAMP
+
+/* -------------------------- Headers and Definitions ---------------------*/
+/*
+ * Add support for other OSes later. For now, it is just Linux.
+ */
+
+#if defined __linux__
+#ifdef __linux__
+#define _GNU_SOURCE 1
+#endif
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <tramp.h>
+#ifdef __linux__
+#include <linux/limits.h>
+#include <linux/types.h>
+#endif
+#endif /* __linux__ */
+
+/*
+ * Each architecture defines static code for a trampoline code table. The
+ * trampoline code table is mapped into the address space of a process.
+ *
+ * The following architecture specific function returns:
+ *
+ * - the address of the trampoline code table in the text segment
+ * - the size of each trampoline in the trampoline code table
+ * - the size of the mapping for the whole trampoline code table
+ */
+void __attribute__((weak)) *ffi_tramp_arch (size_t *tramp_size,
+ size_t *map_size);
+
+/* ------------------------- Trampoline Data Structures --------------------*/
+
+struct tramp;
+
+/*
+ * Trampoline table. Manages one trampoline code table and one trampoline
+ * parameter table.
+ *
+ * prev, next Links in the global trampoline table list.
+ * code_table Trampoline code table mapping.
+ * parm_table Trampoline parameter table mapping.
+ * array Array of trampolines malloced.
+ * free List of free trampolines.
+ * nfree Number of free trampolines.
+ */
+struct tramp_table
+{
+ struct tramp_table *prev;
+ struct tramp_table *next;
+ void *code_table;
+ void *parm_table;
+ struct tramp *array;
+ struct tramp *free;
+ int nfree;
+};
+
+/*
+ * Parameters for each trampoline.
+ *
+ * data
+ * Data for the target code that the trampoline jumps to.
+ * target
+ * Target code that the trampoline jumps to.
+ */
+struct tramp_parm
+{
+ void *data;
+ void *target;
+};
+
+/*
+ * Trampoline structure for each trampoline.
+ *
+ * prev, next Links in the trampoline free list of a trampoline table.
+ * table Trampoline table to which this trampoline belongs.
+ * code Address of this trampoline in the code table mapping.
+ * parm Address of this trampoline's parameters in the parameter
+ * table mapping.
+ */
+struct tramp
+{
+ struct tramp *prev;
+ struct tramp *next;
+ struct tramp_table *table;
+ void *code;
+ struct tramp_parm *parm;
+};
+
+enum tramp_globals_status {
+ TRAMP_GLOBALS_UNINITIALIZED = 0,
+ TRAMP_GLOBALS_PASSED,
+ TRAMP_GLOBALS_FAILED,
+};
+
+/*
+ * Trampoline globals.
+ *
+ * fd
+ * File descriptor of binary file that contains the trampoline code table.
+ * offset
+ * Offset of the trampoline code table in that file.
+ * text
+ * Address of the trampoline code table in the text segment.
+ * map_size
+ * Size of the trampoline code table mapping.
+ * size
+ * Size of one trampoline in the trampoline code table.
+ * ntramp
+ * Total number of trampolines in the trampoline code table.
+ * free_tables
+ * List of trampoline tables that contain free trampolines.
+ * nfree_tables
+ * Number of trampoline tables that contain free trampolines.
+ * status
+ * Initialization status.
+ */
+struct tramp_globals
+{
+ int fd;
+ off_t offset;
+ void *text;
+ size_t map_size;
+ size_t size;
+ int ntramp;
+ struct tramp_table *free_tables;
+ int nfree_tables;
+ enum tramp_globals_status status;
+};
+
+static struct tramp_globals tramp_globals;
+
+/* --------------------- Trampoline File Initialization --------------------*/
+
+/*
+ * The trampoline file is the file used to map the trampoline code table into
+ * the address space of a process. There are two ways to get this file:
+ *
+ * - From the OS. E.g., on Linux, /proc/<pid>/maps lists all the memory
+ * mappings for <pid>. For file-backed mappings, maps supplies the file name
+ * and the file offset. Using this, we can locate the mapping that maps
+ * libffi and get the path to the libffi binary. And, we can compute the
+ * offset of the trampoline code table within that binary.
+ *
+ * - Else, if we can create a temporary file, we can write the trampoline code
+ * table from the text segment into the temporary file.
+ *
+ * The first method is the preferred one. If the OS security subsystem
+ * disallows mapping unsigned files with PROT_EXEC, then the second method
+ * will fail.
+ *
+ * If an OS allows the trampoline code table in the text segment to be
+ * directly remapped (e.g., MACH vm_remap ()), then we don't need the
+ * trampoline file.
+ */
+static int tramp_table_alloc (void);
+
+#if defined __linux__
+
+static int
+ffi_tramp_get_libffi (void)
+{
+ FILE *fp;
+ char file[PATH_MAX], line[PATH_MAX+100], perm[10], dev[10];
+ unsigned long start, end, offset, inode;
+ uintptr_t addr = (uintptr_t) tramp_globals.text;
+ int nfields, found;
+
+ snprintf (file, PATH_MAX, "/proc/%d/maps", getpid());
+ fp = fopen (file, "r");
+ if (fp == NULL)
+ return 0;
+
+ found = 0;
+ while (feof (fp) == 0) {
+ if (fgets (line, sizeof (line), fp) == 0)
+ break;
+
+ nfields = sscanf (line, "%lx-%lx %9s %lx %9s %ld %s",
+ &start, &end, perm, &offset, dev, &inode, file);
+ if (nfields != 7)
+ continue;
+
+ if (addr >= start && addr < end) {
+ tramp_globals.offset = offset + (addr - start);
+ found = 1;
+ break;
+ }
+ }
+ fclose (fp);
+
+ if (!found)
+ return 0;
+
+ tramp_globals.fd = open (file, O_RDONLY);
+ if (tramp_globals.fd == -1)
+ return 0;
+
+ /*
+ * Allocate a trampoline table just to make sure that the trampoline code
+ * table can be mapped.
+ */
+ if (!tramp_table_alloc ())
+ {
+ close (tramp_globals.fd);
+ tramp_globals.fd = -1;
+ return 0;
+ }
+ return 1;
+}
+
+#endif /* __linux__ */
+
+#if defined __linux__
+
+#if defined HAVE_MKSTEMP
+
+static int
+ffi_tramp_get_temp_file (void)
+{
+ char template[12] = "/tmp/XXXXXX";
+ ssize_t count;
+
+ tramp_globals.offset = 0;
+ tramp_globals.fd = mkstemp (template);
+ if (tramp_globals.fd == -1)
+ return 0;
+
+ unlink (template);
+ /*
+ * Write the trampoline code table into the temporary file and allocate a
+ * trampoline table to make sure that the temporary file can be mapped.
+ */
+ count = write(tramp_globals.fd, tramp_globals.text, tramp_globals.map_size);
+ if (count == tramp_globals.map_size && tramp_table_alloc ())
+ return 1;
+
+ close (tramp_globals.fd);
+ tramp_globals.fd = -1;
+ return 0;
+}
+
+#else /* !defined HAVE_MKSTEMP */
+
+/*
+ * TODO:
+ * src/closures.c contains code for finding temp file that has EXEC
+ * permissions. May be, some of that code can be shared with static
+ * trampolines.
+ */
+static int
+ffi_tramp_get_temp_file (void)
+{
+ tramp_globals.offset = 0;
+ tramp_globals.fd = -1;
+ return 0;
+}
+
+#endif /* defined HAVE_MKSTEMP */
+
+#endif /* __linux__ */
+
+/* ------------------------ OS-specific Initialization ----------------------*/
+
+#if defined __linux__
+
+static int
+ffi_tramp_init_os (void)
+{
+ if (ffi_tramp_get_libffi ())
+ return 1;
+ return ffi_tramp_get_temp_file ();
+}
+
+#endif /* __linux__ */
+
+/* --------------------------- OS-specific Locking -------------------------*/
+
+#if defined __linux__
+
+static pthread_mutex_t tramp_globals_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+ffi_tramp_lock(void)
+{
+ pthread_mutex_lock (&tramp_globals_mutex);
+}
+
+static void
+ffi_tramp_unlock()
+{
+ pthread_mutex_unlock (&tramp_globals_mutex);
+}
+
+#endif /* __linux__ */
+
+/* ------------------------ OS-specific Memory Mapping ----------------------*/
+
+/*
+ * Create a trampoline code table mapping and a trampoline parameter table
+ * mapping. The two mappings must be adjacent to each other for PC-relative
+ * access.
+ *
+ * For each trampoline in the code table, there is a corresponding parameter
+ * block in the parameter table. The size of the parameter block is the same
+ * as the size of the trampoline. This means that the parameter block is at
+ * a fixed offset from its trampoline making it easy for a trampoline to find
+ * its parameters using PC-relative access.
+ *
+ * The parameter block will contain a struct tramp_parm. This means that
+ * sizeof (struct tramp_parm) cannot exceed the size of a parameter block.
+ */
+
+#if defined __linux__
+
+static int
+tramp_table_map (struct tramp_table *table)
+{
+ char *addr;
+
+ /*
+ * Create an anonymous mapping twice the map size. The top half will be used
+ * for the code table. The bottom half will be used for the parameter table.
+ */
+ addr = mmap (NULL, tramp_globals.map_size * 2, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED)
+ return 0;
+
+ /*
+ * Replace the top half of the anonymous mapping with the code table mapping.
+ */
+ table->code_table = mmap (addr, tramp_globals.map_size, PROT_READ | PROT_EXEC,
+ MAP_PRIVATE | MAP_FIXED, tramp_globals.fd, tramp_globals.offset);
+ if (table->code_table == MAP_FAILED)
+ {
+ (void) munmap (addr, tramp_globals.map_size * 2);
+ return 0;
+ }
+ table->parm_table = table->code_table + tramp_globals.map_size;
+ return 1;
+}
+
+static void
+tramp_table_unmap (struct tramp_table *table)
+{
+ (void) munmap (table->code_table, tramp_globals.map_size);
+ (void) munmap (table->parm_table, tramp_globals.map_size);
+}
+
+#endif /* __linux__ */
+
+/* ------------------------ Trampoline Initialization ----------------------*/
+
+/*
+ * Initialize the static trampoline feature.
+ */
+static int
+ffi_tramp_init (void)
+{
+ if (tramp_globals.status == TRAMP_GLOBALS_PASSED)
+ return 1;
+
+ if (tramp_globals.status == TRAMP_GLOBALS_FAILED)
+ return 0;
+
+ if (ffi_tramp_arch == NULL)
+ {
+ tramp_globals.status = TRAMP_GLOBALS_FAILED;
+ return 0;
+ }
+
+ tramp_globals.free_tables = NULL;
+ tramp_globals.nfree_tables = 0;
+
+ /*
+ * Get trampoline code table information from the architecture.
+ */
+ tramp_globals.text = ffi_tramp_arch (&tramp_globals.size,
+ &tramp_globals.map_size);
+ tramp_globals.ntramp = tramp_globals.map_size / tramp_globals.size;
+
+ if (sysconf (_SC_PAGESIZE) > tramp_globals.map_size)
+ return 0;
+
+ if (ffi_tramp_init_os ())
+ {
+ tramp_globals.status = TRAMP_GLOBALS_PASSED;
+ return 1;
+ }
+
+ tramp_globals.status = TRAMP_GLOBALS_FAILED;
+ return 0;
+}
+
+/* ---------------------- Trampoline Table functions ---------------------- */
+
+/* This code assumes that malloc () is available on all OSes. */
+
+static void tramp_add (struct tramp *tramp);
+
+/*
+ * Allocate and initialize a trampoline table.
+ */
+static int
+tramp_table_alloc (void)
+{
+ struct tramp_table *table;
+ struct tramp *tramp_array, *tramp;
+ size_t size;
+ char *code, *parm;
+ int i;
+
+ /*
+ * If we already have tables with free trampolines, there is no need to
+ * allocate a new table.
+ */
+ if (tramp_globals.nfree_tables > 0)
+ return 1;
+
+ /*
+ * Allocate a new trampoline table structure.
+ */
+ table = malloc (sizeof (*table));
+ if (table == NULL)
+ return 0;
+
+ /*
+ * Allocate new trampoline structures.
+ */
+ tramp_array = malloc (sizeof (*tramp) * tramp_globals.ntramp);
+ if (tramp_array == NULL)
+ goto free_table;
+
+ /*
+ * Map a code table and a parameter table into the caller's address space.
+ */
+ if (!tramp_table_map (table))
+ {
+ /*
+ * Failed to map the code and parameter tables.
+ */
+ goto free_tramp_array;
+ }
+
+ /*
+ * Initialize the trampoline table.
+ */
+ table->array = tramp_array;
+ table->free = NULL;
+ table->nfree = 0;
+
+ /*
+ * Populate the trampoline table free list. This will also add the trampoline
+ * table to the global list of trampoline tables.
+ */
+ size = tramp_globals.size;
+ code = table->code_table;
+ parm = table->parm_table;
+ for (i = 0; i < tramp_globals.ntramp; i++)
+ {
+ tramp = &tramp_array[i];
+ tramp->table = table;
+ tramp->code = code;
+ tramp->parm = (struct tramp_parm *) parm;
+ tramp_add (tramp);
+
+ code += size;
+ parm += size;
+ }
+ /* Success */
+ return 1;
+
+/* Failure */
+free_tramp_array:
+ free (tramp_array);
+free_table:
+ free (table);
+ return 0;
+}
+
+/*
+ * Free a trampoline table.
+ */
+static void
+tramp_table_free (struct tramp_table *table)
+{
+ tramp_table_unmap (table);
+ free (table->array);
+ free (table);
+}
+
+/*
+ * Add a new trampoline table to the global table list.
+ */
+static void
+tramp_table_add (struct tramp_table *table)
+{
+ table->next = tramp_globals.free_tables;
+ table->prev = NULL;
+ if (tramp_globals.free_tables != NULL)
+ tramp_globals.free_tables->prev = table;
+ tramp_globals.free_tables = table;
+ tramp_globals.nfree_tables++;
+}
+
+/*
+ * Delete a trampoline table from the global table list.
+ */
+static void
+tramp_table_del (struct tramp_table *table)
+{
+ tramp_globals.nfree_tables--;
+ if (table->prev != NULL)
+ table->prev->next = table->next;
+ if (table->next != NULL)
+ table->next->prev = table->prev;
+ if (tramp_globals.free_tables == table)
+ tramp_globals.free_tables = table->next;
+}
+
+/* ------------------------- Trampoline functions ------------------------- */
+
+/*
+ * Add a trampoline to its trampoline table.
+ */
+static void
+tramp_add (struct tramp *tramp)
+{
+ struct tramp_table *table = tramp->table;
+
+ tramp->next = table->free;
+ tramp->prev = NULL;
+ if (table->free != NULL)
+ table->free->prev = tramp;
+ table->free = tramp;
+ table->nfree++;
+
+ if (table->nfree == 1)
+ tramp_table_add (table);
+
+ /*
+ * We don't want to keep too many free trampoline tables lying around.
+ */
+ if (table->nfree == tramp_globals.ntramp &&
+ tramp_globals.nfree_tables > 1)
+ {
+ tramp_table_del (table);
+ tramp_table_free (table);
+ }
+}
+
+/*
+ * Remove a trampoline from its trampoline table.
+ */
+static void
+tramp_del (struct tramp *tramp)
+{
+ struct tramp_table *table = tramp->table;
+
+ table->nfree--;
+ if (tramp->prev != NULL)
+ tramp->prev->next = tramp->next;
+ if (tramp->next != NULL)
+ tramp->next->prev = tramp->prev;
+ if (table->free == tramp)
+ table->free = tramp->next;
+
+ if (table->nfree == 0)
+ tramp_table_del (table);
+}
+
+/* ------------------------ Trampoline API functions ------------------------ */
+
+int
+ffi_tramp_is_supported(void)
+{
+ int ret;
+
+ ffi_tramp_lock();
+ ret = ffi_tramp_init ();
+ ffi_tramp_unlock();
+ return ret;
+}
+
+/*
+ * Allocate a trampoline and return its opaque address.
+ */
+void *
+ffi_tramp_alloc (int flags)
+{
+ struct tramp *tramp;
+
+ ffi_tramp_lock();
+
+ if (!ffi_tramp_init () || flags != 0)
+ {
+ ffi_tramp_unlock();
+ return NULL;
+ }
+
+ if (!tramp_table_alloc ())
+ {
+ ffi_tramp_unlock();
+ return NULL;
+ }
+
+ tramp = tramp_globals.free_tables->free;
+ tramp_del (tramp);
+
+ ffi_tramp_unlock();
+
+ return tramp;
+}
+
+/*
+ * Set the parameters for a trampoline.
+ */
+void
+ffi_tramp_set_parms (void *arg, void *target, void *data)
+{
+ struct tramp *tramp = arg;
+
+ ffi_tramp_lock();
+ tramp->parm->target = target;
+ tramp->parm->data = data;
+ ffi_tramp_unlock();
+}
+
+/*
+ * Get the invocation address of a trampoline.
+ */
+void *
+ffi_tramp_get_addr (void *arg)
+{
+ struct tramp *tramp = arg;
+ void *addr;
+
+ ffi_tramp_lock();
+ addr = tramp->code;
+ ffi_tramp_unlock();
+
+ return addr;
+}
+
+/*
+ * Free a trampoline.
+ */
+void
+ffi_tramp_free (void *arg)
+{
+ struct tramp *tramp = arg;
+
+ ffi_tramp_lock();
+ tramp_add (tramp);
+ ffi_tramp_unlock();
+}
+
+/* ------------------------------------------------------------------------- */
+
+#else /* !FFI_EXEC_STATIC_TRAMP */
+
+#include <stddef.h>
+
+int
+ffi_tramp_is_supported(void)
+{
+ return 0;
+}
+
+void *
+ffi_tramp_alloc (int flags)
+{
+ return NULL;
+}
+
+void
+ffi_tramp_set_parms (void *arg, void *target, void *data)
+{
+}
+
+void *
+ffi_tramp_get_addr (void *arg)
+{
+ return NULL;
+}
+
+void
+ffi_tramp_free (void *arg)
+{
+}
+
+#endif /* FFI_EXEC_STATIC_TRAMP */
diff --git a/libffi/src/types.c b/libffi/src/types.c
index 7e80aec..9ec27f6 100644
--- a/libffi/src/types.c
+++ b/libffi/src/types.c
@@ -38,6 +38,7 @@ struct struct_align_##name { \
char c; \
type x; \
}; \
+FFI_EXTERN \
maybe_const ffi_type ffi_type_##name = { \
sizeof(type), \
offsetof(struct struct_align_##name, x), \
@@ -52,6 +53,7 @@ struct struct_align_complex_##name { \
char c; \
_Complex type x; \
}; \
+FFI_EXTERN \
maybe_const ffi_type ffi_type_complex_##name = { \
sizeof(_Complex type), \
offsetof(struct struct_align_complex_##name, x), \
@@ -60,7 +62,7 @@ maybe_const ffi_type ffi_type_complex_##name = { \
}
/* Size and alignment are fake here. They must not be 0. */
-const ffi_type ffi_type_void = {
+FFI_EXTERN const ffi_type ffi_type_void = {
1, 1, FFI_TYPE_VOID, NULL
};
diff --git a/libffi/src/vax/ffi.c b/libffi/src/vax/ffi.c
index f4d6bbb..e52caec 100644
--- a/libffi/src/vax/ffi.c
+++ b/libffi/src/vax/ffi.c
@@ -108,7 +108,7 @@ ffi_prep_args (extended_cif *ecif, void *stack)
/* Align if necessary. */
if ((sizeof(int) - 1) & z)
- z = ALIGN(z, sizeof(int));
+ z = FFI_ALIGN(z, sizeof(int));
}
p_argv++;
@@ -215,7 +215,7 @@ ffi_prep_closure_elfbsd (ffi_cif *cif, void **avalue, char *stackp)
/* Align if necessary */
if ((sizeof (int) - 1) & z)
- z = ALIGN(z, sizeof (int));
+ z = FFI_ALIGN(z, sizeof (int));
p_argv++;
stackp += z;
diff --git a/libffi/src/x86/asmnames.h b/libffi/src/x86/asmnames.h
new file mode 100644
index 0000000..7551021
--- /dev/null
+++ b/libffi/src/x86/asmnames.h
@@ -0,0 +1,30 @@
+#ifndef ASMNAMES_H
+#define ASMNAMES_H
+
+#define C2(X, Y) X ## Y
+#define C1(X, Y) C2(X, Y)
+#ifdef __USER_LABEL_PREFIX__
+# define C(X) C1(__USER_LABEL_PREFIX__, X)
+#else
+# define C(X) X
+#endif
+
+#ifdef __APPLE__
+# define L(X) C1(L, X)
+#else
+# define L(X) C1(.L, X)
+#endif
+
+#if defined(__ELF__) && defined(__PIC__)
+# define PLT(X) X@PLT
+#else
+# define PLT(X) X
+#endif
+
+#ifdef __ELF__
+# define ENDF(X) .type X,@function; .size X, . - X
+#else
+# define ENDF(X)
+#endif
+
+#endif /* ASMNAMES_H */
diff --git a/libffi/src/x86/darwin.S b/libffi/src/x86/darwin.S
deleted file mode 100644
index 8f0f070..0000000
--- a/libffi/src/x86/darwin.S
+++ /dev/null
@@ -1,444 +0,0 @@
-/* -----------------------------------------------------------------------
- darwin.S - Copyright (c) 1996, 1998, 2001, 2002, 2003, 2005 Red Hat, Inc.
- Copyright (C) 2008 Free Software Foundation, Inc.
-
- X86 Foreign Function Interface
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
- -----------------------------------------------------------------------
- */
-
-#ifndef __x86_64__
-
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
-.text
-
-.globl _ffi_prep_args
-
- .align 4
-.globl _ffi_call_SYSV
-
-_ffi_call_SYSV:
-.LFB1:
- pushl %ebp
-.LCFI0:
- movl %esp,%ebp
-.LCFI1:
- subl $8,%esp
- /* Make room for all of the new args. */
- movl 16(%ebp),%ecx
- subl %ecx,%esp
-
- movl %esp,%eax
-
- /* Place all of the ffi_prep_args in position */
- subl $8,%esp
- pushl 12(%ebp)
- pushl %eax
- call *8(%ebp)
-
- /* Return stack to previous state and call the function */
- addl $16,%esp
-
- call *28(%ebp)
-
- /* Load %ecx with the return type code */
- movl 20(%ebp),%ecx
-
- /* Protect %esi. We're going to pop it in the epilogue. */
- pushl %esi
-
- /* If the return value pointer is NULL, assume no return value. */
- cmpl $0,24(%ebp)
- jne 0f
-
- /* Even if there is no space for the return value, we are
- obliged to handle floating-point values. */
- cmpl $FFI_TYPE_FLOAT,%ecx
- jne noretval
- fstp %st(0)
-
- jmp epilogue
-0:
- .align 4
- call 1f
-.Lstore_table:
- .long noretval-.Lstore_table /* FFI_TYPE_VOID */
- .long retint-.Lstore_table /* FFI_TYPE_INT */
- .long retfloat-.Lstore_table /* FFI_TYPE_FLOAT */
- .long retdouble-.Lstore_table /* FFI_TYPE_DOUBLE */
- .long retlongdouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */
- .long retuint8-.Lstore_table /* FFI_TYPE_UINT8 */
- .long retsint8-.Lstore_table /* FFI_TYPE_SINT8 */
- .long retuint16-.Lstore_table /* FFI_TYPE_UINT16 */
- .long retsint16-.Lstore_table /* FFI_TYPE_SINT16 */
- .long retint-.Lstore_table /* FFI_TYPE_UINT32 */
- .long retint-.Lstore_table /* FFI_TYPE_SINT32 */
- .long retint64-.Lstore_table /* FFI_TYPE_UINT64 */
- .long retint64-.Lstore_table /* FFI_TYPE_SINT64 */
- .long retstruct-.Lstore_table /* FFI_TYPE_STRUCT */
- .long retint-.Lstore_table /* FFI_TYPE_POINTER */
- .long retstruct1b-.Lstore_table /* FFI_TYPE_SMALL_STRUCT_1B */
- .long retstruct2b-.Lstore_table /* FFI_TYPE_SMALL_STRUCT_2B */
-1:
- pop %esi
- add (%esi, %ecx, 4), %esi
- jmp *%esi
-
- /* Sign/zero extend as appropriate. */
-retsint8:
- movsbl %al, %eax
- jmp retint
-
-retsint16:
- movswl %ax, %eax
- jmp retint
-
-retuint8:
- movzbl %al, %eax
- jmp retint
-
-retuint16:
- movzwl %ax, %eax
- jmp retint
-
-retfloat:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstps (%ecx)
- jmp epilogue
-
-retdouble:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstpl (%ecx)
- jmp epilogue
-
-retlongdouble:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- fstpt (%ecx)
- jmp epilogue
-
-retint64:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movl %eax,0(%ecx)
- movl %edx,4(%ecx)
- jmp epilogue
-
-retstruct1b:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movb %al,0(%ecx)
- jmp epilogue
-
-retstruct2b:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movw %ax,0(%ecx)
- jmp epilogue
-
-retint:
- /* Load %ecx with the pointer to storage for the return value */
- movl 24(%ebp),%ecx
- movl %eax,0(%ecx)
-
-retstruct:
- /* Nothing to do! */
-
-noretval:
-epilogue:
- popl %esi
- movl %ebp,%esp
- popl %ebp
- ret
-
-.LFE1:
-.ffi_call_SYSV_end:
-
- .align 4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl _ffi_closure_SYSV
-
-_ffi_closure_SYSV:
-.LFB2:
- pushl %ebp
-.LCFI2:
- movl %esp, %ebp
-.LCFI3:
- subl $40, %esp
- leal -24(%ebp), %edx
- movl %edx, -12(%ebp) /* resp */
- leal 8(%ebp), %edx
- movl %edx, 4(%esp) /* args = __builtin_dwarf_cfa () */
- leal -12(%ebp), %edx
- movl %edx, (%esp) /* &resp */
- movl %ebx, 8(%esp)
-.LCFI7:
- call L_ffi_closure_SYSV_inner$stub
- movl 8(%esp), %ebx
- movl -12(%ebp), %ecx
- cmpl $FFI_TYPE_INT, %eax
- je .Lcls_retint
-
- /* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
- FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32. */
- cmpl $FFI_TYPE_UINT64, %eax
- jge 0f
- cmpl $FFI_TYPE_UINT8, %eax
- jge .Lcls_retint
-
-0: cmpl $FFI_TYPE_FLOAT, %eax
- je .Lcls_retfloat
- cmpl $FFI_TYPE_DOUBLE, %eax
- je .Lcls_retdouble
- cmpl $FFI_TYPE_LONGDOUBLE, %eax
- je .Lcls_retldouble
- cmpl $FFI_TYPE_SINT64, %eax
- je .Lcls_retllong
- cmpl $FFI_TYPE_SMALL_STRUCT_1B, %eax
- je .Lcls_retstruct1b
- cmpl $FFI_TYPE_SMALL_STRUCT_2B, %eax
- je .Lcls_retstruct2b
- cmpl $FFI_TYPE_STRUCT, %eax
- je .Lcls_retstruct
-.Lcls_epilogue:
- movl %ebp, %esp
- popl %ebp
- ret
-.Lcls_retint:
- movl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retfloat:
- flds (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retdouble:
- fldl (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retldouble:
- fldt (%ecx)
- jmp .Lcls_epilogue
-.Lcls_retllong:
- movl (%ecx), %eax
- movl 4(%ecx), %edx
- jmp .Lcls_epilogue
-.Lcls_retstruct1b:
- movsbl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retstruct2b:
- movswl (%ecx), %eax
- jmp .Lcls_epilogue
-.Lcls_retstruct:
- lea -8(%ebp),%esp
- movl %ebp, %esp
- popl %ebp
- ret $4
-.LFE2:
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-
- .align 4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl _ffi_closure_raw_SYSV
-
-_ffi_closure_raw_SYSV:
-.LFB3:
- pushl %ebp
-.LCFI4:
- movl %esp, %ebp
-.LCFI5:
- pushl %esi
-.LCFI6:
- subl $36, %esp
- movl RAW_CLOSURE_CIF_OFFSET(%eax), %esi /* closure->cif */
- movl RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
- movl %edx, 12(%esp) /* user_data */
- leal 8(%ebp), %edx /* __builtin_dwarf_cfa () */
- movl %edx, 8(%esp) /* raw_args */
- leal -24(%ebp), %edx
- movl %edx, 4(%esp) /* &res */
- movl %esi, (%esp) /* cif */
- call *RAW_CLOSURE_FUN_OFFSET(%eax) /* closure->fun */
- movl CIF_FLAGS_OFFSET(%esi), %eax /* rtype */
- cmpl $FFI_TYPE_INT, %eax
- je .Lrcls_retint
-
- /* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
- FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32. */
- cmpl $FFI_TYPE_UINT64, %eax
- jge 0f
- cmpl $FFI_TYPE_UINT8, %eax
- jge .Lrcls_retint
-0:
- cmpl $FFI_TYPE_FLOAT, %eax
- je .Lrcls_retfloat
- cmpl $FFI_TYPE_DOUBLE, %eax
- je .Lrcls_retdouble
- cmpl $FFI_TYPE_LONGDOUBLE, %eax
- je .Lrcls_retldouble
- cmpl $FFI_TYPE_SINT64, %eax
- je .Lrcls_retllong
-.Lrcls_epilogue:
- addl $36, %esp
- popl %esi
- popl %ebp
- ret
-.Lrcls_retint:
- movl -24(%ebp), %eax
- jmp .Lrcls_epilogue
-.Lrcls_retfloat:
- flds -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retdouble:
- fldl -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retldouble:
- fldt -24(%ebp)
- jmp .Lrcls_epilogue
-.Lrcls_retllong:
- movl -24(%ebp), %eax
- movl -20(%ebp), %edx
- jmp .Lrcls_epilogue
-.LFE3:
-#endif
-
-.section __IMPORT,__jump_table,symbol_stubs,self_modifying_code+pure_instructions,5
-L_ffi_closure_SYSV_inner$stub:
- .indirect_symbol _ffi_closure_SYSV_inner
- hlt ; hlt ; hlt ; hlt ; hlt
-
-
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
- .set L$set$0,LECIE1-LSCIE1
- .long L$set$0
-LSCIE1:
- .long 0x0
- .byte 0x1
- .ascii "zR\0"
- .byte 0x1
- .byte 0x7c
- .byte 0x8
- .byte 0x1
- .byte 0x10
- .byte 0xc
- .byte 0x5
- .byte 0x4
- .byte 0x88
- .byte 0x1
- .align 2
-LECIE1:
-.globl _ffi_call_SYSV.eh
-_ffi_call_SYSV.eh:
-LSFDE1:
- .set L$set$1,LEFDE1-LASFDE1
- .long L$set$1
-LASFDE1:
- .long LASFDE1-EH_frame1
- .long .LFB1-.
- .set L$set$2,.LFE1-.LFB1
- .long L$set$2
- .byte 0x0
- .byte 0x4
- .set L$set$3,.LCFI0-.LFB1
- .long L$set$3
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$4,.LCFI1-.LCFI0
- .long L$set$4
- .byte 0xd
- .byte 0x4
- .align 2
-LEFDE1:
-.globl _ffi_closure_SYSV.eh
-_ffi_closure_SYSV.eh:
-LSFDE2:
- .set L$set$5,LEFDE2-LASFDE2
- .long L$set$5
-LASFDE2:
- .long LASFDE2-EH_frame1
- .long .LFB2-.
- .set L$set$6,.LFE2-.LFB2
- .long L$set$6
- .byte 0x0
- .byte 0x4
- .set L$set$7,.LCFI2-.LFB2
- .long L$set$7
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$8,.LCFI3-.LCFI2
- .long L$set$8
- .byte 0xd
- .byte 0x4
- .align 2
-LEFDE2:
-
-#if !FFI_NO_RAW_API
-
-.globl _ffi_closure_raw_SYSV.eh
-_ffi_closure_raw_SYSV.eh:
-LSFDE3:
- .set L$set$10,LEFDE3-LASFDE3
- .long L$set$10
-LASFDE3:
- .long LASFDE3-EH_frame1
- .long .LFB3-.
- .set L$set$11,.LFE3-.LFB3
- .long L$set$11
- .byte 0x0
- .byte 0x4
- .set L$set$12,.LCFI4-.LFB3
- .long L$set$12
- .byte 0xe
- .byte 0x8
- .byte 0x84
- .byte 0x2
- .byte 0x4
- .set L$set$13,.LCFI5-.LCFI4
- .long L$set$13
- .byte 0xd
- .byte 0x4
- .byte 0x4
- .set L$set$14,.LCFI6-.LCFI5
- .long L$set$14
- .byte 0x85
- .byte 0x3
- .align 2
-LEFDE3:
-
-#endif
-
-#endif /* ifndef __x86_64__ */
diff --git a/libffi/src/x86/darwin64.S b/libffi/src/x86/darwin64.S
deleted file mode 100644
index 2f7394e..0000000
--- a/libffi/src/x86/darwin64.S
+++ /dev/null
@@ -1,416 +0,0 @@
-/* -----------------------------------------------------------------------
- darwin64.S - Copyright (c) 2006 Free Software Foundation, Inc.
- Copyright (c) 2008 Red Hat, Inc.
- derived from unix64.S
-
- x86-64 Foreign Function Interface for Darwin.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR
- OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- OTHER DEALINGS IN THE SOFTWARE.
- ----------------------------------------------------------------------- */
-
-#ifdef __x86_64__
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
- .file "darwin64.S"
-.text
-
-/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)(void));
-
- Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
- for this function. This has been allocated by ffi_call. We also
- deallocate some of the stack that has been alloca'd. */
-
- .align 3
- .globl _ffi_call_unix64
-
-_ffi_call_unix64:
-LUW0:
- movq (%rsp), %r10 /* Load return address. */
- leaq (%rdi, %rsi), %rax /* Find local stack base. */
- movq %rdx, (%rax) /* Save flags. */
- movq %rcx, 8(%rax) /* Save raddr. */
- movq %rbp, 16(%rax) /* Save old frame pointer. */
- movq %r10, 24(%rax) /* Relocate return address. */
- movq %rax, %rbp /* Finalize local stack frame. */
-LUW1:
- movq %rdi, %r10 /* Save a copy of the register area. */
- movq %r8, %r11 /* Save a copy of the target fn. */
- movl %r9d, %eax /* Set number of SSE registers. */
-
- /* Load up all argument registers. */
- movq (%r10), %rdi
- movq 8(%r10), %rsi
- movq 16(%r10), %rdx
- movq 24(%r10), %rcx
- movq 32(%r10), %r8
- movq 40(%r10), %r9
- testl %eax, %eax
- jnz Lload_sse
-Lret_from_load_sse:
-
- /* Deallocate the reg arg area. */
- leaq 176(%r10), %rsp
-
- /* Call the user function. */
- call *%r11
-
- /* Deallocate stack arg area; local stack frame in redzone. */
- leaq 24(%rbp), %rsp
-
- movq 0(%rbp), %rcx /* Reload flags. */
- movq 8(%rbp), %rdi /* Reload raddr. */
- movq 16(%rbp), %rbp /* Reload old frame pointer. */
-LUW2:
-
- /* The first byte of the flags contains the FFI_TYPE. */
- movzbl %cl, %r10d
- leaq Lstore_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
- jmp *%r10
-
-Lstore_table:
- .long Lst_void-Lstore_table /* FFI_TYPE_VOID */
- .long Lst_sint32-Lstore_table /* FFI_TYPE_INT */
- .long Lst_float-Lstore_table /* FFI_TYPE_FLOAT */
- .long Lst_double-Lstore_table /* FFI_TYPE_DOUBLE */
- .long Lst_ldouble-Lstore_table /* FFI_TYPE_LONGDOUBLE */
- .long Lst_uint8-Lstore_table /* FFI_TYPE_UINT8 */
- .long Lst_sint8-Lstore_table /* FFI_TYPE_SINT8 */
- .long Lst_uint16-Lstore_table /* FFI_TYPE_UINT16 */
- .long Lst_sint16-Lstore_table /* FFI_TYPE_SINT16 */
- .long Lst_uint32-Lstore_table /* FFI_TYPE_UINT32 */
- .long Lst_sint32-Lstore_table /* FFI_TYPE_SINT32 */
- .long Lst_int64-Lstore_table /* FFI_TYPE_UINT64 */
- .long Lst_int64-Lstore_table /* FFI_TYPE_SINT64 */
- .long Lst_struct-Lstore_table /* FFI_TYPE_STRUCT */
- .long Lst_int64-Lstore_table /* FFI_TYPE_POINTER */
-
- .text
- .align 3
-Lst_void:
- ret
- .align 3
-Lst_uint8:
- movzbq %al, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_sint8:
- movsbq %al, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_uint16:
- movzwq %ax, %rax
- movq %rax, (%rdi)
- .align 3
-Lst_sint16:
- movswq %ax, %rax
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_uint32:
- movl %eax, %eax
- movq %rax, (%rdi)
- .align 3
-Lst_sint32:
- cltq
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_int64:
- movq %rax, (%rdi)
- ret
- .align 3
-Lst_float:
- movss %xmm0, (%rdi)
- ret
- .align 3
-Lst_double:
- movsd %xmm0, (%rdi)
- ret
-Lst_ldouble:
- fstpt (%rdi)
- ret
- .align 3
-Lst_struct:
- leaq -20(%rsp), %rsi /* Scratch area in redzone. */
-
- /* We have to locate the values now, and since we don't want to
- write too much data into the user's return value, we spill the
- value to a 16 byte scratch area first. Bits 8, 9, and 10
- control where the values are located. Only one of the three
- bits will be set; see ffi_prep_cif_machdep for the pattern. */
- movd %xmm0, %r10
- movd %xmm1, %r11
- testl $0x100, %ecx
- cmovnz %rax, %rdx
- cmovnz %r10, %rax
- testl $0x200, %ecx
- cmovnz %r10, %rdx
- testl $0x400, %ecx
- cmovnz %r10, %rax
- cmovnz %r11, %rdx
- movq %rax, (%rsi)
- movq %rdx, 8(%rsi)
-
- /* Bits 12-31 contain the true size of the structure. Copy from
- the scratch area to the true destination. */
- shrl $12, %ecx
- rep movsb
- ret
-
- /* Many times we can avoid loading any SSE registers at all.
- It's not worth an indirect jump to load the exact set of
- SSE registers needed; zero or all is a good compromise. */
- .align 3
-LUW3:
-Lload_sse:
- movdqa 48(%r10), %xmm0
- movdqa 64(%r10), %xmm1
- movdqa 80(%r10), %xmm2
- movdqa 96(%r10), %xmm3
- movdqa 112(%r10), %xmm4
- movdqa 128(%r10), %xmm5
- movdqa 144(%r10), %xmm6
- movdqa 160(%r10), %xmm7
- jmp Lret_from_load_sse
-
-LUW4:
- .align 3
- .globl _ffi_closure_unix64
-
-_ffi_closure_unix64:
-LUW5:
- /* The carry flag is set by the trampoline iff SSE registers
- are used. Don't clobber it before the branch instruction. */
- leaq -200(%rsp), %rsp
-LUW6:
- movq %rdi, (%rsp)
- movq %rsi, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rcx, 24(%rsp)
- movq %r8, 32(%rsp)
- movq %r9, 40(%rsp)
- jc Lsave_sse
-Lret_from_save_sse:
-
- movq %r10, %rdi
- leaq 176(%rsp), %rsi
- movq %rsp, %rdx
- leaq 208(%rsp), %rcx
- call _ffi_closure_unix64_inner
-
- /* Deallocate stack frame early; return value is now in redzone. */
- addq $200, %rsp
-LUW7:
-
- /* The first byte of the return value contains the FFI_TYPE. */
- movzbl %al, %r10d
- leaq Lload_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
- jmp *%r10
-
-Lload_table:
- .long Lld_void-Lload_table /* FFI_TYPE_VOID */
- .long Lld_int32-Lload_table /* FFI_TYPE_INT */
- .long Lld_float-Lload_table /* FFI_TYPE_FLOAT */
- .long Lld_double-Lload_table /* FFI_TYPE_DOUBLE */
- .long Lld_ldouble-Lload_table /* FFI_TYPE_LONGDOUBLE */
- .long Lld_int8-Lload_table /* FFI_TYPE_UINT8 */
- .long Lld_int8-Lload_table /* FFI_TYPE_SINT8 */
- .long Lld_int16-Lload_table /* FFI_TYPE_UINT16 */
- .long Lld_int16-Lload_table /* FFI_TYPE_SINT16 */
- .long Lld_int32-Lload_table /* FFI_TYPE_UINT32 */
- .long Lld_int32-Lload_table /* FFI_TYPE_SINT32 */
- .long Lld_int64-Lload_table /* FFI_TYPE_UINT64 */
- .long Lld_int64-Lload_table /* FFI_TYPE_SINT64 */
- .long Lld_struct-Lload_table /* FFI_TYPE_STRUCT */
- .long Lld_int64-Lload_table /* FFI_TYPE_POINTER */
-
- .text
- .align 3
-Lld_void:
- ret
- .align 3
-Lld_int8:
- movzbl -24(%rsp), %eax
- ret
- .align 3
-Lld_int16:
- movzwl -24(%rsp), %eax
- ret
- .align 3
-Lld_int32:
- movl -24(%rsp), %eax
- ret
- .align 3
-Lld_int64:
- movq -24(%rsp), %rax
- ret
- .align 3
-Lld_float:
- movss -24(%rsp), %xmm0
- ret
- .align 3
-Lld_double:
- movsd -24(%rsp), %xmm0
- ret
- .align 3
-Lld_ldouble:
- fldt -24(%rsp)
- ret
- .align 3
-Lld_struct:
- /* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
- %rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading
- both rdx and xmm1 with the second word. For the remaining,
- bit 8 set means xmm0 gets the second word, and bit 9 means
- that rax gets the second word. */
- movq -24(%rsp), %rcx
- movq -16(%rsp), %rdx
- movq -16(%rsp), %xmm1
- testl $0x100, %eax
- cmovnz %rdx, %rcx
- movd %rcx, %xmm0
- testl $0x200, %eax
- movq -24(%rsp), %rax
- cmovnz %rdx, %rax
- ret
-
- /* See the comment above Lload_sse; the same logic applies here. */
- .align 3
-LUW8:
-Lsave_sse:
- movdqa %xmm0, 48(%rsp)
- movdqa %xmm1, 64(%rsp)
- movdqa %xmm2, 80(%rsp)
- movdqa %xmm3, 96(%rsp)
- movdqa %xmm4, 112(%rsp)
- movdqa %xmm5, 128(%rsp)
- movdqa %xmm6, 144(%rsp)
- movdqa %xmm7, 160(%rsp)
- jmp Lret_from_save_sse
-
-LUW9:
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
- .set L$set$0,LECIE1-LSCIE1 /* CIE Length */
- .long L$set$0
-LSCIE1:
- .long 0x0 /* CIE Identifier Tag */
- .byte 0x1 /* CIE Version */
- .ascii "zR\0" /* CIE Augmentation */
- .byte 0x1 /* uleb128 0x1; CIE Code Alignment Factor */
- .byte 0x78 /* sleb128 -8; CIE Data Alignment Factor */
- .byte 0x10 /* CIE RA Column */
- .byte 0x1 /* uleb128 0x1; Augmentation size */
- .byte 0x10 /* FDE Encoding (pcrel sdata4) */
- .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
- .byte 0x7 /* uleb128 0x7 */
- .byte 0x8 /* uleb128 0x8 */
- .byte 0x90 /* DW_CFA_offset, column 0x10 */
- .byte 0x1
- .align 3
-LECIE1:
- .globl _ffi_call_unix64.eh
-_ffi_call_unix64.eh:
-LSFDE1:
- .set L$set$1,LEFDE1-LASFDE1 /* FDE Length */
- .long L$set$1
-LASFDE1:
- .long LASFDE1-EH_frame1 /* FDE CIE offset */
- .quad LUW0-. /* FDE initial location */
- .set L$set$2,LUW4-LUW0 /* FDE address range */
- .quad L$set$2
- .byte 0x0 /* Augmentation size */
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$3,LUW1-LUW0
- .long L$set$3
-
- /* New stack frame based off rbp. This is a itty bit of unwind
- trickery in that the CFA *has* changed. There is no easy way
- to describe it correctly on entry to the function. Fortunately,
- it doesn't matter too much since at all points we can correctly
- unwind back to ffi_call. Note that the location to which we
- moved the return address is (the new) CFA-8, so from the
- perspective of the unwind info, it hasn't moved. */
- .byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
- .byte 0x6
- .byte 0x20
- .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
- .byte 0x2
- .byte 0xa /* DW_CFA_remember_state */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$4,LUW2-LUW1
- .long L$set$4
- .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
- .byte 0x7
- .byte 0x8
- .byte 0xc0+6 /* DW_CFA_restore, %rbp */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$5,LUW3-LUW2
- .long L$set$5
- .byte 0xb /* DW_CFA_restore_state */
-
- .align 3
-LEFDE1:
- .globl _ffi_closure_unix64.eh
-_ffi_closure_unix64.eh:
-LSFDE3:
- .set L$set$6,LEFDE3-LASFDE3 /* FDE Length */
- .long L$set$6
-LASFDE3:
- .long LASFDE3-EH_frame1 /* FDE CIE offset */
- .quad LUW5-. /* FDE initial location */
- .set L$set$7,LUW9-LUW5 /* FDE address range */
- .quad L$set$7
- .byte 0x0 /* Augmentation size */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$8,LUW6-LUW5
- .long L$set$8
- .byte 0xe /* DW_CFA_def_cfa_offset */
- .byte 208,1 /* uleb128 208 */
- .byte 0xa /* DW_CFA_remember_state */
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$9,LUW7-LUW6
- .long L$set$9
- .byte 0xe /* DW_CFA_def_cfa_offset */
- .byte 0x8
-
- .byte 0x4 /* DW_CFA_advance_loc4 */
- .set L$set$10,LUW8-LUW7
- .long L$set$10
- .byte 0xb /* DW_CFA_restore_state */
-
- .align 3
-LEFDE3:
- .subsections_via_symbols
-
-#endif /* __x86_64__ */
diff --git a/libffi/src/x86/darwin64_c.c b/libffi/src/x86/darwin64_c.c
deleted file mode 100644
index 1daa1c0..0000000
--- a/libffi/src/x86/darwin64_c.c
+++ /dev/null
@@ -1,643 +0,0 @@
-/* -----------------------------------------------------------------------
- ffi64.c - Copyright (c) 20011 Anthony Green
- Copyright (c) 2008, 2010 Red Hat, Inc.
- Copyright (c) 2002, 2007 Bo Thorsen <bo@suse.de>
-
- x86-64 Foreign Function Interface
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
- ----------------------------------------------------------------------- */
-
-#include <ffi.h>
-#include <ffi_common.h>
-
-#include <stdlib.h>
-#include <stdarg.h>
-
-#ifdef __x86_64__
-
-#define MAX_GPR_REGS 6
-#define MAX_SSE_REGS 8
-
-#ifdef __INTEL_COMPILER
-#define UINT128 __m128
-#else
-#define UINT128 __int128_t
-#endif
-
-struct register_args
-{
- /* Registers for argument passing. */
- UINT64 gpr[MAX_GPR_REGS];
- UINT128 sse[MAX_SSE_REGS];
-};
-
-extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
- void *raddr, void (*fnaddr)(void), unsigned ssecount);
-
-/* All reference to register classes here is identical to the code in
- gcc/config/i386/i386.c. Do *not* change one without the other. */
-
-/* Register class used for passing given 64bit part of the argument.
- These represent classes as documented by the PS ABI, with the
- exception of SSESF, SSEDF classes, that are basically SSE class,
- just gcc will use SF or DFmode move instead of DImode to avoid
- reformatting penalties.
-
- Similary we play games with INTEGERSI_CLASS to use cheaper SImode moves
- whenever possible (upper half does contain padding). */
-enum x86_64_reg_class
- {
- X86_64_NO_CLASS,
- X86_64_INTEGER_CLASS,
- X86_64_INTEGERSI_CLASS,
- X86_64_SSE_CLASS,
- X86_64_SSESF_CLASS,
- X86_64_SSEDF_CLASS,
- X86_64_SSEUP_CLASS,
- X86_64_X87_CLASS,
- X86_64_X87UP_CLASS,
- X86_64_COMPLEX_X87_CLASS,
- X86_64_MEMORY_CLASS
- };
-
-#define MAX_CLASSES 4
-
-#define SSE_CLASS_P(X) ((X) >= X86_64_SSE_CLASS && X <= X86_64_SSEUP_CLASS)
-
-/* x86-64 register passing implementation. See x86-64 ABI for details. Goal
- of this code is to classify each 8bytes of incoming argument by the register
- class and assign registers accordingly. */
-
-/* Return the union class of CLASS1 and CLASS2.
- See the x86-64 PS ABI for details. */
-
-static enum x86_64_reg_class
-merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
-{
- /* Rule #1: If both classes are equal, this is the resulting class. */
- if (class1 == class2)
- return class1;
-
- /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
- the other class. */
- if (class1 == X86_64_NO_CLASS)
- return class2;
- if (class2 == X86_64_NO_CLASS)
- return class1;
-
- /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
- if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
- return X86_64_MEMORY_CLASS;
-
- /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
- if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
- || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
- return X86_64_INTEGERSI_CLASS;
- if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
- || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
- return X86_64_INTEGER_CLASS;
-
- /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
- MEMORY is used. */
- if (class1 == X86_64_X87_CLASS
- || class1 == X86_64_X87UP_CLASS
- || class1 == X86_64_COMPLEX_X87_CLASS
- || class2 == X86_64_X87_CLASS
- || class2 == X86_64_X87UP_CLASS
- || class2 == X86_64_COMPLEX_X87_CLASS)
- return X86_64_MEMORY_CLASS;
-
- /* Rule #6: Otherwise class SSE is used. */
- return X86_64_SSE_CLASS;
-}
-
-/* Classify the argument of type TYPE and mode MODE.
- CLASSES will be filled by the register class used to pass each word
- of the operand. The number of words is returned. In case the parameter
- should be passed in memory, 0 is returned. As a special case for zero
- sized containers, classes[0] will be NO_CLASS and 1 is returned.
-
- See the x86-64 PS ABI for details.
-*/
-static int
-classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
- size_t byte_offset)
-{
- switch (type->type)
- {
- case FFI_TYPE_UINT8:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT16:
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
- case FFI_TYPE_UINT64:
- case FFI_TYPE_SINT64:
- case FFI_TYPE_POINTER:
- {
- int size = byte_offset + type->size;
-
- if (size <= 4)
- {
- classes[0] = X86_64_INTEGERSI_CLASS;
- return 1;
- }
- else if (size <= 8)
- {
- classes[0] = X86_64_INTEGER_CLASS;
- return 1;
- }
- else if (size <= 12)
- {
- classes[0] = X86_64_INTEGER_CLASS;
- classes[1] = X86_64_INTEGERSI_CLASS;
- return 2;
- }
- else if (size <= 16)
- {
- classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
- return 2;
- }
- else
- FFI_ASSERT (0);
- }
- case FFI_TYPE_FLOAT:
- if (!(byte_offset % 8))
- classes[0] = X86_64_SSESF_CLASS;
- else
- classes[0] = X86_64_SSE_CLASS;
- return 1;
- case FFI_TYPE_DOUBLE:
- classes[0] = X86_64_SSEDF_CLASS;
- return 1;
- case FFI_TYPE_LONGDOUBLE:
- classes[0] = X86_64_X87_CLASS;
- classes[1] = X86_64_X87UP_CLASS;
- return 2;
- case FFI_TYPE_STRUCT:
- {
- const int UNITS_PER_WORD = 8;
- int words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
- ffi_type **ptr;
- int i;
- enum x86_64_reg_class subclasses[MAX_CLASSES];
-
- /* If the struct is larger than 32 bytes, pass it on the stack. */
- if (type->size > 32)
- return 0;
-
- for (i = 0; i < words; i++)
- classes[i] = X86_64_NO_CLASS;
-
- /* Zero sized arrays or structures are NO_CLASS. We return 0 to
- signalize memory class, so handle it as special case. */
- if (!words)
- {
- classes[0] = X86_64_NO_CLASS;
- return 1;
- }
-
- /* Merge the fields of structure. */
- for (ptr = type->elements; *ptr != NULL; ptr++)
- {
- int num;
-
- byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
-
- num = classify_argument (*ptr, subclasses, byte_offset % 8);
- if (num == 0)
- return 0;
- for (i = 0; i < num; i++)
- {
- int pos = byte_offset / 8;
- classes[i + pos] =
- merge_classes (subclasses[i], classes[i + pos]);
- }
-
- byte_offset += (*ptr)->size;
- }
-
- if (words > 2)
- {
- /* When size > 16 bytes, if the first one isn't
- X86_64_SSE_CLASS or any other ones aren't
- X86_64_SSEUP_CLASS, everything should be passed in
- memory. */
- if (classes[0] != X86_64_SSE_CLASS)
- return 0;
-
- for (i = 1; i < words; i++)
- if (classes[i] != X86_64_SSEUP_CLASS)
- return 0;
- }
-
- /* Final merger cleanup. */
- for (i = 0; i < words; i++)
- {
- /* If one class is MEMORY, everything should be passed in
- memory. */
- if (classes[i] == X86_64_MEMORY_CLASS)
- return 0;
-
- /* The X86_64_SSEUP_CLASS should be always preceded by
- X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
- if (classes[i] == X86_64_SSEUP_CLASS
- && classes[i - 1] != X86_64_SSE_CLASS
- && classes[i - 1] != X86_64_SSEUP_CLASS)
- {
- /* The first one should never be X86_64_SSEUP_CLASS. */
- FFI_ASSERT (i != 0);
- classes[i] = X86_64_SSE_CLASS;
- }
-
- /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
- everything should be passed in memory. */
- if (classes[i] == X86_64_X87UP_CLASS
- && (classes[i - 1] != X86_64_X87_CLASS))
- {
- /* The first one should never be X86_64_X87UP_CLASS. */
- FFI_ASSERT (i != 0);
- return 0;
- }
- }
- return words;
- }
-
- default:
- FFI_ASSERT(0);
- }
- return 0; /* Never reached. */
-}
-
-/* Examine the argument and return set number of register required in each
- class. Return zero iff parameter should be passed in memory, otherwise
- the number of registers. */
-
-static int
-examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
- _Bool in_return, int *pngpr, int *pnsse)
-{
- int i, n, ngpr, nsse;
-
- n = classify_argument (type, classes, 0);
- if (n == 0)
- return 0;
-
- ngpr = nsse = 0;
- for (i = 0; i < n; ++i)
- switch (classes[i])
- {
- case X86_64_INTEGER_CLASS:
- case X86_64_INTEGERSI_CLASS:
- ngpr++;
- break;
- case X86_64_SSE_CLASS:
- case X86_64_SSESF_CLASS:
- case X86_64_SSEDF_CLASS:
- nsse++;
- break;
- case X86_64_NO_CLASS:
- case X86_64_SSEUP_CLASS:
- break;
- case X86_64_X87_CLASS:
- case X86_64_X87UP_CLASS:
- case X86_64_COMPLEX_X87_CLASS:
- return in_return != 0;
- default:
- abort ();
- }
-
- *pngpr = ngpr;
- *pnsse = nsse;
-
- return n;
-}
-
-/* Perform machine dependent cif processing. */
-
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
-{
- int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
- enum x86_64_reg_class classes[MAX_CLASSES];
- size_t bytes;
-
- gprcount = ssecount = 0;
-
- flags = cif->rtype->type;
- if (flags != FFI_TYPE_VOID)
- {
- n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
- if (n == 0)
- {
- /* The return value is passed in memory. A pointer to that
- memory is the first argument. Allocate a register for it. */
- gprcount++;
- /* We don't have to do anything in asm for the return. */
- flags = FFI_TYPE_VOID;
- }
- else if (flags == FFI_TYPE_STRUCT)
- {
- /* Mark which registers the result appears in. */
- _Bool sse0 = SSE_CLASS_P (classes[0]);
- _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
- if (sse0 && !sse1)
- flags |= 1 << 8;
- else if (!sse0 && sse1)
- flags |= 1 << 9;
- else if (sse0 && sse1)
- flags |= 1 << 10;
- /* Mark the true size of the structure. */
- flags |= cif->rtype->size << 12;
- }
- }
-
- /* Go over all arguments and determine the way they should be passed.
- If it's in a register and there is space for it, let that be so. If
- not, add it's size to the stack byte count. */
- for (bytes = 0, i = 0, avn = cif->nargs; i < avn; i++)
- {
- if (examine_argument (cif->arg_types[i], classes, 0, &ngpr, &nsse) == 0
- || gprcount + ngpr > MAX_GPR_REGS
- || ssecount + nsse > MAX_SSE_REGS)
- {
- long align = cif->arg_types[i]->alignment;
-
- if (align < 8)
- align = 8;
-
- bytes = ALIGN (bytes, align);
- bytes += cif->arg_types[i]->size;
- }
- else
- {
- gprcount += ngpr;
- ssecount += nsse;
- }
- }
- if (ssecount)
- flags |= 1 << 11;
- cif->flags = flags;
- cif->bytes = ALIGN (bytes, 8);
-
- return FFI_OK;
-}
-
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
- enum x86_64_reg_class classes[MAX_CLASSES];
- char *stack, *argp;
- ffi_type **arg_types;
- int gprcount, ssecount, ngpr, nsse, i, avn;
- _Bool ret_in_memory;
- struct register_args *reg_args;
-
- /* Can't call 32-bit mode from 64-bit mode. */
- FFI_ASSERT (cif->abi == FFI_UNIX64);
-
- /* If the return value is a struct and we don't have a return value
- address then we need to make one. Note the setting of flags to
- VOID above in ffi_prep_cif_machdep. */
- ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
- && (cif->flags & 0xff) == FFI_TYPE_VOID);
- if (rvalue == NULL && ret_in_memory)
- rvalue = alloca (cif->rtype->size);
-
- /* Allocate the space for the arguments, plus 4 words of temp space. */
- stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
- reg_args = (struct register_args *) stack;
- argp = stack + sizeof (struct register_args);
-
- gprcount = ssecount = 0;
-
- /* If the return value is passed in memory, add the pointer as the
- first integer argument. */
- if (ret_in_memory)
- reg_args->gpr[gprcount++] = (unsigned long) rvalue;
-
- avn = cif->nargs;
- arg_types = cif->arg_types;
-
- for (i = 0; i < avn; ++i)
- {
- size_t size = arg_types[i]->size;
- int n;
-
- n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
- if (n == 0
- || gprcount + ngpr > MAX_GPR_REGS
- || ssecount + nsse > MAX_SSE_REGS)
- {
- long align = arg_types[i]->alignment;
-
- /* Stack arguments are *always* at least 8 byte aligned. */
- if (align < 8)
- align = 8;
-
- /* Pass this argument in memory. */
- argp = (void *) ALIGN (argp, align);
- memcpy (argp, avalue[i], size);
- argp += size;
- }
- else
- {
- /* The argument is passed entirely in registers. */
- char *a = (char *) avalue[i];
- int j;
-
- for (j = 0; j < n; j++, a += 8, size -= 8)
- {
- switch (classes[j])
- {
- case X86_64_INTEGER_CLASS:
- case X86_64_INTEGERSI_CLASS:
- reg_args->gpr[gprcount] = 0;
- memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
- gprcount++;
- break;
- case X86_64_SSE_CLASS:
- case X86_64_SSEDF_CLASS:
- reg_args->sse[ssecount++] = *(UINT64 *) a;
- break;
- case X86_64_SSESF_CLASS:
- reg_args->sse[ssecount++] = *(UINT32 *) a;
- break;
- default:
- abort();
- }
- }
- }
- }
-
- ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
- cif->flags, rvalue, fn, ssecount);
-}
-
-
-extern void ffi_closure_unix64(void);
-
-ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
- ffi_cif* cif,
- void (*fun)(ffi_cif*, void*, void**, void*),
- void *user_data,
- void *codeloc)
-{
- volatile unsigned short *tramp;
-
- /* Sanity check on the cif ABI. */
- {
- int abi = cif->abi;
- if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
- return FFI_BAD_ABI;
- }
-
- tramp = (volatile unsigned short *) &closure->tramp[0];
-
- tramp[0] = 0xbb49; /* mov <code>, %r11 */
- *((unsigned long long * volatile) &tramp[1])
- = (unsigned long) ffi_closure_unix64;
- tramp[5] = 0xba49; /* mov <data>, %r10 */
- *((unsigned long long * volatile) &tramp[6])
- = (unsigned long) codeloc;
-
- /* Set the carry bit iff the function uses any sse registers.
- This is clc or stc, together with the first byte of the jmp. */
- tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
-
- tramp[11] = 0xe3ff; /* jmp *%r11 */
-
- closure->cif = cif;
- closure->fun = fun;
- closure->user_data = user_data;
-
- return FFI_OK;
-}
-
-int
-ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
- struct register_args *reg_args, char *argp)
-{
- ffi_cif *cif;
- void **avalue;
- ffi_type **arg_types;
- long i, avn;
- int gprcount, ssecount, ngpr, nsse;
- int ret;
-
- cif = closure->cif;
- avalue = alloca(cif->nargs * sizeof(void *));
- gprcount = ssecount = 0;
-
- ret = cif->rtype->type;
- if (ret != FFI_TYPE_VOID)
- {
- enum x86_64_reg_class classes[MAX_CLASSES];
- int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
- if (n == 0)
- {
- /* The return value goes in memory. Arrange for the closure
- return value to go directly back to the original caller. */
- rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
- /* We don't have to do anything in asm for the return. */
- ret = FFI_TYPE_VOID;
- }
- else if (ret == FFI_TYPE_STRUCT && n == 2)
- {
- /* Mark which register the second word of the structure goes in. */
- _Bool sse0 = SSE_CLASS_P (classes[0]);
- _Bool sse1 = SSE_CLASS_P (classes[1]);
- if (!sse0 && sse1)
- ret |= 1 << 8;
- else if (sse0 && !sse1)
- ret |= 1 << 9;
- }
- }
-
- avn = cif->nargs;
- arg_types = cif->arg_types;
-
- for (i = 0; i < avn; ++i)
- {
- enum x86_64_reg_class classes[MAX_CLASSES];
- int n;
-
- n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
- if (n == 0
- || gprcount + ngpr > MAX_GPR_REGS
- || ssecount + nsse > MAX_SSE_REGS)
- {
- long align = arg_types[i]->alignment;
-
- /* Stack arguments are *always* at least 8 byte aligned. */
- if (align < 8)
- align = 8;
-
- /* Pass this argument in memory. */
- argp = (void *) ALIGN (argp, align);
- avalue[i] = argp;
- argp += arg_types[i]->size;
- }
- /* If the argument is in a single register, or two consecutive
- integer registers, then we can use that address directly. */
- else if (n == 1
- || (n == 2 && !(SSE_CLASS_P (classes[0])
- || SSE_CLASS_P (classes[1]))))
- {
- /* The argument is in a single register. */
- if (SSE_CLASS_P (classes[0]))
- {
- avalue[i] = &reg_args->sse[ssecount];
- ssecount += n;
- }
- else
- {
- avalue[i] = &reg_args->gpr[gprcount];
- gprcount += n;
- }
- }
- /* Otherwise, allocate space to make them consecutive. */
- else
- {
- char *a = alloca (16);
- int j;
-
- avalue[i] = a;
- for (j = 0; j < n; j++, a += 8)
- {
- if (SSE_CLASS_P (classes[j]))
- memcpy (a, &reg_args->sse[ssecount++], 8);
- else
- memcpy (a, &reg_args->gpr[gprcount++], 8);
- }
- }
- }
-
- /* Invoke the closure. */
- closure->fun (cif, rvalue, avalue, closure->user_data);
-
- /* Tell assembly how to perform return type promotions. */
- return ret;
-}
-
-#endif /* __x86_64__ */
diff --git a/libffi/src/x86/darwin_c.c b/libffi/src/x86/darwin_c.c
deleted file mode 100644
index 6338de2..0000000
--- a/libffi/src/x86/darwin_c.c
+++ /dev/null
@@ -1,843 +0,0 @@
-/* -----------------------------------------------------------------------
- ffi.c - Copyright (c) 1996, 1998, 1999, 2001, 2007, 2008 Red Hat, Inc.
- Copyright (c) 2002 Ranjit Mathew
- Copyright (c) 2002 Bo Thorsen
- Copyright (c) 2002 Roger Sayle
- Copyright (C) 2008, 2010 Free Software Foundation, Inc.
-
- x86 Foreign Function Interface
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- ``Software''), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
- ----------------------------------------------------------------------- */
-
-#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
-
-#ifdef _WIN64
-#include <windows.h>
-#endif
-
-#include <ffi.h>
-#include <ffi_common.h>
-
-#include <stdlib.h>
-
-/* ffi_prep_args is called by the assembly routine once stack space
- has been allocated for the function's arguments */
-
-void ffi_prep_args(char *stack, extended_cif *ecif)
-{
- register unsigned int i;
- register void **p_argv;
- register char *argp;
- register ffi_type **p_arg;
-#ifdef X86_WIN32
- size_t p_stack_args[2];
- void *p_stack_data[2];
- char *argp2 = stack;
- int stack_args_count = 0;
- int cabi = ecif->cif->abi;
-#endif
-
- argp = stack;
-
- if ((ecif->cif->flags == FFI_TYPE_STRUCT
- || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
- && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
- && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
-#endif
- )
- {
- *(void **) argp = ecif->rvalue;
-#ifdef X86_WIN32
- /* For fastcall/thiscall this is first register-passed
- argument. */
- if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
- {
- p_stack_args[stack_args_count] = sizeof (void*);
- p_stack_data[stack_args_count] = argp;
- ++stack_args_count;
- }
-#endif
- argp += sizeof(void*);
- }
-
- p_argv = ecif->avalue;
-
- for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
- i != 0;
- i--, p_arg++)
- {
- size_t z;
-
- /* Align if necessary */
- if ((sizeof(void*) - 1) & (size_t) argp)
- argp = (char *) ALIGN(argp, sizeof(void*));
-
- z = (*p_arg)->size;
-#ifdef X86_WIN64
- if (z > sizeof(ffi_arg)
- || ((*p_arg)->type == FFI_TYPE_STRUCT
- && (z != 1 && z != 2 && z != 4 && z != 8))
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
- || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
- )
- {
- z = sizeof(ffi_arg);
- *(void **)argp = *p_argv;
- }
- else if ((*p_arg)->type == FFI_TYPE_FLOAT)
- {
- memcpy(argp, *p_argv, z);
- }
- else
-#endif
- if (z < sizeof(ffi_arg))
- {
- z = sizeof(ffi_arg);
- switch ((*p_arg)->type)
- {
- case FFI_TYPE_SINT8:
- *(ffi_sarg *) argp = (ffi_sarg)*(SINT8 *)(* p_argv);
- break;
-
- case FFI_TYPE_UINT8:
- *(ffi_arg *) argp = (ffi_arg)*(UINT8 *)(* p_argv);
- break;
-
- case FFI_TYPE_SINT16:
- *(ffi_sarg *) argp = (ffi_sarg)*(SINT16 *)(* p_argv);
- break;
-
- case FFI_TYPE_UINT16:
- *(ffi_arg *) argp = (ffi_arg)*(UINT16 *)(* p_argv);
- break;
-
- case FFI_TYPE_SINT32:
- *(ffi_sarg *) argp = (ffi_sarg)*(SINT32 *)(* p_argv);
- break;
-
- case FFI_TYPE_UINT32:
- *(ffi_arg *) argp = (ffi_arg)*(UINT32 *)(* p_argv);
- break;
-
- case FFI_TYPE_STRUCT:
- *(ffi_arg *) argp = *(ffi_arg *)(* p_argv);
- break;
-
- default:
- FFI_ASSERT(0);
- }
- }
- else
- {
- memcpy(argp, *p_argv, z);
- }
-
-#ifdef X86_WIN32
- /* For thiscall/fastcall convention register-passed arguments
- are the first two none-floating-point arguments with a size
- smaller or equal to sizeof (void*). */
- if ((cabi == FFI_THISCALL && stack_args_count < 1)
- || (cabi == FFI_FASTCALL && stack_args_count < 2))
- {
- if (z <= 4
- && ((*p_arg)->type != FFI_TYPE_FLOAT
- && (*p_arg)->type != FFI_TYPE_STRUCT))
- {
- p_stack_args[stack_args_count] = z;
- p_stack_data[stack_args_count] = argp;
- ++stack_args_count;
- }
- }
-#endif
- p_argv++;
-#ifdef X86_WIN64
- argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
- argp += z;
-#endif
- }
-
-#ifdef X86_WIN32
- /* We need to move the register-passed arguments for thiscall/fastcall
- on top of stack, so that those can be moved to registers ecx/edx by
- call-handler. */
- if (stack_args_count > 0)
- {
- size_t zz = (p_stack_args[0] + 3) & ~3;
- char *h;
-
- /* Move first argument to top-stack position. */
- if (p_stack_data[0] != argp2)
- {
- h = alloca (zz + 1);
- memcpy (h, p_stack_data[0], zz);
- memmove (argp2 + zz, argp2,
- (size_t) ((char *) p_stack_data[0] - (char*)argp2));
- memcpy (argp2, h, zz);
- }
-
- argp2 += zz;
- --stack_args_count;
- if (zz > 4)
- stack_args_count = 0;
-
- /* If we have a second argument, then move it on top
- after the first one. */
- if (stack_args_count > 0 && p_stack_data[1] != argp2)
- {
- zz = p_stack_args[1];
- zz = (zz + 3) & ~3;
- h = alloca (zz + 1);
- h = alloca (zz + 1);
- memcpy (h, p_stack_data[1], zz);
- memmove (argp2 + zz, argp2, (size_t) ((char*) p_stack_data[1] - (char*)argp2));
- memcpy (argp2, h, zz);
- }
- }
-#endif
- return;
-}
-
-/* Perform machine dependent cif processing */
-ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
-{
- unsigned int i;
- ffi_type **ptr;
-
- /* Set the return type flag */
- switch (cif->rtype->type)
- {
- case FFI_TYPE_VOID:
- case FFI_TYPE_UINT8:
- case FFI_TYPE_UINT16:
- case FFI_TYPE_SINT8:
- case FFI_TYPE_SINT16:
-#ifdef X86_WIN64
- case FFI_TYPE_UINT32:
- case FFI_TYPE_SINT32:
-#endif
- case FFI_TYPE_SINT64:
- case FFI_TYPE_FLOAT:
- case FFI_TYPE_DOUBLE:
-#ifndef X86_WIN64
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
- case FFI_TYPE_LONGDOUBLE:
-#endif
-#endif
- cif->flags = (unsigned) cif->rtype->type;
- break;
-
- case FFI_TYPE_UINT64:
-#ifdef X86_WIN64
- case FFI_TYPE_POINTER:
-#endif
- cif->flags = FFI_TYPE_SINT64;
- break;
-
- case FFI_TYPE_STRUCT:
-#ifndef X86
- if (cif->rtype->size == 1)
- {
- cif->flags = FFI_TYPE_SMALL_STRUCT_1B; /* same as char size */
- }
- else if (cif->rtype->size == 2)
- {
- cif->flags = FFI_TYPE_SMALL_STRUCT_2B; /* same as short size */
- }
- else if (cif->rtype->size == 4)
- {
-#ifdef X86_WIN64
- cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
-#else
- cif->flags = FFI_TYPE_INT; /* same as int type */
-#endif
- }
- else if (cif->rtype->size == 8)
- {
- cif->flags = FFI_TYPE_SINT64; /* same as int64 type */
- }
- else
-#endif
- {
-#ifdef X86_WIN32
- if (cif->abi == FFI_MS_CDECL)
- cif->flags = FFI_TYPE_MS_STRUCT;
- else
-#endif
- cif->flags = FFI_TYPE_STRUCT;
- /* allocate space for return value pointer */
- cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
- }
- break;
-
- default:
-#ifdef X86_WIN64
- cif->flags = FFI_TYPE_SINT64;
- break;
- case FFI_TYPE_INT:
- cif->flags = FFI_TYPE_SINT32;
-#else
- cif->flags = FFI_TYPE_INT;
-#endif
- break;
- }
-
- for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++)
- {
- if (((*ptr)->alignment - 1) & cif->bytes)
- cif->bytes = ALIGN(cif->bytes, (*ptr)->alignment);
- cif->bytes += ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
- }
-
-#ifdef X86_WIN64
- /* ensure space for storing four registers */
- cif->bytes += 4 * sizeof(ffi_arg);
-#endif
-
-#ifdef X86_DARWIN
- cif->bytes = (cif->bytes + 15) & ~0xF;
-#endif
-
- return FFI_OK;
-}
-
-#ifdef X86_WIN64
-extern int
-ffi_call_win64(void (*)(char *, extended_cif *), extended_cif *,
- unsigned, unsigned, unsigned *, void (*fn)(void));
-#elif defined(X86_WIN32)
-extern void
-ffi_call_win32(void (*)(char *, extended_cif *), extended_cif *,
- unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
-#else
-extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
- unsigned, unsigned, unsigned *, void (*fn)(void));
-#endif
-
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
- extended_cif ecif;
-
- ecif.cif = cif;
- ecif.avalue = avalue;
-
- /* If the return value is a struct and we don't have a return */
- /* value address then we need to make one */
-
-#ifdef X86_WIN64
- if (rvalue == NULL
- && cif->flags == FFI_TYPE_STRUCT
- && cif->rtype->size != 1 && cif->rtype->size != 2
- && cif->rtype->size != 4 && cif->rtype->size != 8)
- {
- ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
- }
-#else
- if (rvalue == NULL
- && (cif->flags == FFI_TYPE_STRUCT
- || cif->flags == FFI_TYPE_MS_STRUCT))
- {
- ecif.rvalue = alloca(cif->rtype->size);
- }
-#endif
- else
- ecif.rvalue = rvalue;
-
-
- switch (cif->abi)
- {
-#ifdef X86_WIN64
- case FFI_WIN64:
- ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
- cif->flags, ecif.rvalue, fn);
- break;
-#elif defined(X86_WIN32)
- case FFI_SYSV:
- case FFI_STDCALL:
- case FFI_MS_CDECL:
- ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
- ecif.rvalue, fn);
- break;
- case FFI_THISCALL:
- case FFI_FASTCALL:
- {
- unsigned int abi = cif->abi;
- unsigned int i, passed_regs = 0;
-
- if (cif->flags == FFI_TYPE_STRUCT)
- ++passed_regs;
-
- for (i=0; i < cif->nargs && passed_regs < 2;i++)
- {
- size_t sz;
-
- if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
- || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
- continue;
- sz = (cif->arg_types[i]->size + 3) & ~3;
- if (sz == 0 || sz > 4)
- continue;
- ++passed_regs;
- }
- if (passed_regs < 2 && abi == FFI_FASTCALL)
- abi = FFI_THISCALL;
- if (passed_regs < 1 && abi == FFI_THISCALL)
- abi = FFI_STDCALL;
- ffi_call_win32(ffi_prep_args, &ecif, abi, cif->bytes, cif->flags,
- ecif.rvalue, fn);
- }
- break;
-#else
- case FFI_SYSV:
- ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
- fn);
- break;
-#endif
- default:
- FFI_ASSERT(0);
- break;
- }
-}
-
-
-/** private members **/
-
-/* The following __attribute__((regparm(1))) decorations will have no effect
- on MSVC - standard cdecl convention applies. */
-static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
- void** args, ffi_cif* cif);
-void FFI_HIDDEN ffi_closure_SYSV (ffi_closure *)
- __attribute__ ((regparm(1)));
-unsigned int FFI_HIDDEN ffi_closure_SYSV_inner (ffi_closure *, void **, void *)
- __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
- __attribute__ ((regparm(1)));
-#ifdef X86_WIN32
-void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
- __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *)
- __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *)
- __attribute__ ((regparm(1)));
-#endif
-#ifdef X86_WIN64
-void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
-#endif
-
-/* This function is jumped to by the trampoline */
-
-#ifdef X86_WIN64
-void * FFI_HIDDEN
-ffi_closure_win64_inner (ffi_closure *closure, void *args) {
- ffi_cif *cif;
- void **arg_area;
- void *result;
- void *resp = &result;
-
- cif = closure->cif;
- arg_area = (void**) alloca (cif->nargs * sizeof (void*));
-
- /* this call will initialize ARG_AREA, such that each
- * element in that array points to the corresponding
- * value on the stack; and if the function returns
- * a structure, it will change RESP to point to the
- * structure return address. */
-
- ffi_prep_incoming_args_SYSV(args, &resp, arg_area, cif);
-
- (closure->fun) (cif, resp, arg_area, closure->user_data);
-
- /* The result is returned in rax. This does the right thing for
- result types except for floats; we have to 'mov xmm0, rax' in the
- caller to correct this.
- TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
- */
- return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
-}
-
-#else
-unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
-ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
-{
- /* our various things... */
- ffi_cif *cif;
- void **arg_area;
-
- cif = closure->cif;
- arg_area = (void**) alloca (cif->nargs * sizeof (void*));
-
- /* this call will initialize ARG_AREA, such that each
- * element in that array points to the corresponding
- * value on the stack; and if the function returns
- * a structure, it will change RESP to point to the
- * structure return address. */
-
- ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
-
- (closure->fun) (cif, *respp, arg_area, closure->user_data);
-
- return cif->flags;
-}
-#endif /* !X86_WIN64 */
-
-static void
-ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
- ffi_cif *cif)
-{
- register unsigned int i;
- register void **p_argv;
- register char *argp;
- register ffi_type **p_arg;
-
- argp = stack;
-
-#ifdef X86_WIN64
- if (cif->rtype->size > sizeof(ffi_arg)
- || (cif->flags == FFI_TYPE_STRUCT
- && (cif->rtype->size != 1 && cif->rtype->size != 2
- && cif->rtype->size != 4 && cif->rtype->size != 8))) {
- *rvalue = *(void **) argp;
- argp += sizeof(void *);
- }
-#else
- if ( cif->flags == FFI_TYPE_STRUCT
- || cif->flags == FFI_TYPE_MS_STRUCT ) {
- *rvalue = *(void **) argp;
- argp += sizeof(void *);
- }
-#endif
-
- p_argv = avalue;
-
- for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
- {
- size_t z;
-
- /* Align if necessary */
- if ((sizeof(void*) - 1) & (size_t) argp) {
- argp = (char *) ALIGN(argp, sizeof(void*));
- }
-
-#ifdef X86_WIN64
- if ((*p_arg)->size > sizeof(ffi_arg)
- || ((*p_arg)->type == FFI_TYPE_STRUCT
- && ((*p_arg)->size != 1 && (*p_arg)->size != 2
- && (*p_arg)->size != 4 && (*p_arg)->size != 8)))
- {
- z = sizeof(void *);
- *p_argv = *(void **)argp;
- }
- else
-#endif
- {
- z = (*p_arg)->size;
-
- /* because we're little endian, this is what it turns into. */
-
- *p_argv = (void*) argp;
- }
-
- p_argv++;
-#ifdef X86_WIN64
- argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
- argp += z;
-#endif
- }
-
- return;
-}
-
-#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
- void* __fun = (void*)(FUN); \
- void* __ctx = (void*)(CTX); \
- *(unsigned char*) &__tramp[0] = 0x41; \
- *(unsigned char*) &__tramp[1] = 0xbb; \
- *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
- *(unsigned char*) &__tramp[6] = 0x48; \
- *(unsigned char*) &__tramp[7] = 0xb8; \
- *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
- *(unsigned char *) &__tramp[16] = 0x49; \
- *(unsigned char *) &__tramp[17] = 0xba; \
- *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
- *(unsigned char *) &__tramp[26] = 0x41; \
- *(unsigned char *) &__tramp[27] = 0xff; \
- *(unsigned char *) &__tramp[28] = 0xe2; /* jmp %r10 */ \
- }
-
-/* How to make a trampoline. Derived from gcc/config/i386/i386.c. */
-
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
- unsigned int __fun = (unsigned int)(FUN); \
- unsigned int __ctx = (unsigned int)(CTX); \
- unsigned int __dis = __fun - (__ctx + 10); \
- *(unsigned char*) &__tramp[0] = 0xb8; \
- *(unsigned int*) &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
- *(unsigned char *) &__tramp[5] = 0xe9; \
- *(unsigned int*) &__tramp[6] = __dis; /* jmp __fun */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_THISCALL(TRAMP,FUN,CTX,SIZE) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
- unsigned int __fun = (unsigned int)(FUN); \
- unsigned int __ctx = (unsigned int)(CTX); \
- unsigned int __dis = __fun - (__ctx + 49); \
- unsigned short __size = (unsigned short)(SIZE); \
- *(unsigned int *) &__tramp[0] = 0x8324048b; /* mov (%esp), %eax */ \
- *(unsigned int *) &__tramp[4] = 0x4c890cec; /* sub $12, %esp */ \
- *(unsigned int *) &__tramp[8] = 0x04890424; /* mov %ecx, 4(%esp) */ \
- *(unsigned char*) &__tramp[12] = 0x24; /* mov %eax, (%esp) */ \
- *(unsigned char*) &__tramp[13] = 0xb8; \
- *(unsigned int *) &__tramp[14] = __size; /* mov __size, %eax */ \
- *(unsigned int *) &__tramp[18] = 0x08244c8d; /* lea 8(%esp), %ecx */ \
- *(unsigned int *) &__tramp[22] = 0x4802e8c1; /* shr $2, %eax ; dec %eax */ \
- *(unsigned short*) &__tramp[26] = 0x0b74; /* jz 1f */ \
- *(unsigned int *) &__tramp[28] = 0x8908518b; /* 2b: mov 8(%ecx), %edx */ \
- *(unsigned int *) &__tramp[32] = 0x04c18311; /* mov %edx, (%ecx) ; add $4, %ecx */ \
- *(unsigned char*) &__tramp[36] = 0x48; /* dec %eax */ \
- *(unsigned short*) &__tramp[37] = 0xf575; /* jnz 2b ; 1f: */ \
- *(unsigned char*) &__tramp[39] = 0xb8; \
- *(unsigned int*) &__tramp[40] = __ctx; /* movl __ctx, %eax */ \
- *(unsigned char *) &__tramp[44] = 0xe8; \
- *(unsigned int*) &__tramp[45] = __dis; /* call __fun */ \
- *(unsigned char*) &__tramp[49] = 0xc2; /* ret */ \
- *(unsigned short*) &__tramp[50] = (__size + 8); /* ret (__size + 8) */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_STDCALL(TRAMP,FUN,CTX,SIZE) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
- unsigned int __fun = (unsigned int)(FUN); \
- unsigned int __ctx = (unsigned int)(CTX); \
- unsigned int __dis = __fun - (__ctx + 10); \
- unsigned short __size = (unsigned short)(SIZE); \
- *(unsigned char*) &__tramp[0] = 0xb8; \
- *(unsigned int*) &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
- *(unsigned char *) &__tramp[5] = 0xe8; \
- *(unsigned int*) &__tramp[6] = __dis; /* call __fun */ \
- *(unsigned char *) &__tramp[10] = 0xc2; \
- *(unsigned short*) &__tramp[11] = __size; /* ret __size */ \
- }
-
-/* the cif must already be prep'ed */
-
-ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
- ffi_cif* cif,
- void (*fun)(ffi_cif*,void*,void**,void*),
- void *user_data,
- void *codeloc)
-{
-#ifdef X86_WIN64
-#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
-#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
- if (cif->abi == FFI_WIN64)
- {
- int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
- FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
- &ffi_closure_win64,
- codeloc, mask);
- /* make sure we can execute here */
- }
-#else
- if (cif->abi == FFI_SYSV)
- {
- FFI_INIT_TRAMPOLINE (&closure->tramp[0],
- &ffi_closure_SYSV,
- (void*)codeloc);
- }
-#ifdef X86_WIN32
- else if (cif->abi == FFI_THISCALL)
- {
- FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0],
- &ffi_closure_THISCALL,
- (void*)codeloc,
- cif->bytes);
- }
- else if (cif->abi == FFI_STDCALL)
- {
- FFI_INIT_TRAMPOLINE_STDCALL (&closure->tramp[0],
- &ffi_closure_STDCALL,
- (void*)codeloc, cif->bytes);
- }
- else if (cif->abi == FFI_MS_CDECL)
- {
- FFI_INIT_TRAMPOLINE (&closure->tramp[0],
- &ffi_closure_SYSV,
- (void*)codeloc);
- }
-#endif /* X86_WIN32 */
-#endif /* !X86_WIN64 */
- else
- {
- return FFI_BAD_ABI;
- }
-
- closure->cif = cif;
- closure->user_data = user_data;
- closure->fun = fun;
-
- return FFI_OK;
-}
-
-/* ------- Native raw API support -------------------------------- */
-
-#if !FFI_NO_RAW_API
-
-ffi_status
-ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
- ffi_cif* cif,
- void (*fun)(ffi_cif*,void*,ffi_raw*,void*),
- void *user_data,
- void *codeloc)
-{
- int i;
-
- if (cif->abi != FFI_SYSV) {
-#ifdef X86_WIN32
- if (cif->abi != FFI_THISCALL)
-#endif
- return FFI_BAD_ABI;
- }
-
- /* we currently don't support certain kinds of arguments for raw
- closures. This should be implemented by a separate assembly
- language routine, since it would require argument processing,
- something we don't do now for performance. */
-
- for (i = cif->nargs-1; i >= 0; i--)
- {
- FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_STRUCT);
- FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_LONGDOUBLE);
- }
-
-#ifdef X86_WIN32
- if (cif->abi == FFI_SYSV)
- {
-#endif
- FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_raw_SYSV,
- codeloc);
-#ifdef X86_WIN32
- }
- else if (cif->abi == FFI_THISCALL)
- {
- FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0], &ffi_closure_raw_THISCALL,
- codeloc, cif->bytes);
- }
-#endif
- closure->cif = cif;
- closure->user_data = user_data;
- closure->fun = fun;
-
- return FFI_OK;
-}
-
-static void
-ffi_prep_args_raw(char *stack, extended_cif *ecif)
-{
- memcpy (stack, ecif->avalue, ecif->cif->bytes);
-}
-
-/* we borrow this routine from libffi (it must be changed, though, to
- * actually call the function passed in the first argument. as of
- * libffi-1.20, this is not the case.)
- */
-
-void
-ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
-{
- extended_cif ecif;
- void **avalue = (void **)fake_avalue;
-
- ecif.cif = cif;
- ecif.avalue = avalue;
-
- /* If the return value is a struct and we don't have a return */
- /* value address then we need to make one */
-
- if (rvalue == NULL
- && (cif->flags == FFI_TYPE_STRUCT
- || cif->flags == FFI_TYPE_MS_STRUCT))
- {
- ecif.rvalue = alloca(cif->rtype->size);
- }
- else
- ecif.rvalue = rvalue;
-
-
- switch (cif->abi)
- {
-#ifdef X86_WIN32
- case FFI_SYSV:
- case FFI_STDCALL:
- case FFI_MS_CDECL:
- ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
- ecif.rvalue, fn);
- break;
- case FFI_THISCALL:
- case FFI_FASTCALL:
- {
- unsigned int abi = cif->abi;
- unsigned int i, passed_regs = 0;
-
- if (cif->flags == FFI_TYPE_STRUCT)
- ++passed_regs;
-
- for (i=0; i < cif->nargs && passed_regs < 2;i++)
- {
- size_t sz;
-
- if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
- || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
- continue;
- sz = (cif->arg_types[i]->size + 3) & ~3;
- if (sz == 0 || sz > 4)
- continue;
- ++passed_regs;
- }
- if (passed_regs < 2 && abi == FFI_FASTCALL)
- cif->abi = abi = FFI_THISCALL;
- if (passed_regs < 1 && abi == FFI_THISCALL)
- cif->abi = abi = FFI_STDCALL;
- ffi_call_win32(ffi_prep_args_raw, &ecif, abi, cif->bytes, cif->flags,
- ecif.rvalue, fn);
- }
- break;
-#else
- case FFI_SYSV:
- ffi_call_SYSV(ffi_prep_args_raw, &ecif, cif->bytes, cif->flags,
- ecif.rvalue, fn);
- break;
-#endif
- default:
- FFI_ASSERT(0);
- break;
- }
-}
-
-#endif
-
-#endif /* !__x86_64__ || X86_WIN64 */
-
diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index feb5cbb..24431c1 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -1,5 +1,6 @@
/* -----------------------------------------------------------------------
- ffi.c - Copyright (c) 1996, 1998, 1999, 2001, 2007, 2008 Red Hat, Inc.
+ ffi.c - Copyright (c) 2017 Anthony Green
+ Copyright (c) 1996, 1998, 1999, 2001, 2007, 2008 Red Hat, Inc.
Copyright (c) 2002 Ranjit Mathew
Copyright (c) 2002 Bo Thorsen
Copyright (c) 2002 Roger Sayle
@@ -28,10 +29,12 @@
DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------- */
-#ifndef __x86_64__
+#if defined(__i386__) || defined(_M_IX86)
#include <ffi.h>
#include <ffi_common.h>
+#include <stdint.h>
#include <stdlib.h>
+#include <tramp.h>
#include "internal.h"
/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
@@ -49,6 +52,13 @@
# define __declspec(x) __attribute__((x))
#endif
+#if defined(_MSC_VER) && defined(_M_IX86)
+/* Stack is not 16-byte aligned on Windows. */
+#define STACK_ALIGN(bytes) (bytes)
+#else
+#define STACK_ALIGN(bytes) FFI_ALIGN (bytes, 16)
+#endif
+
/* Perform machine dependent cif processing. */
ffi_status FFI_HIDDEN
ffi_prep_cif_machdep(ffi_cif *cif)
@@ -134,7 +144,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
break;
}
/* Allocate space for return value pointer. */
- bytes += ALIGN (sizeof(void*), FFI_SIZEOF_ARG);
+ bytes += FFI_ALIGN (sizeof(void*), FFI_SIZEOF_ARG);
}
break;
case FFI_TYPE_COMPLEX:
@@ -172,10 +182,10 @@ ffi_prep_cif_machdep(ffi_cif *cif)
{
ffi_type *t = cif->arg_types[i];
- bytes = ALIGN (bytes, t->alignment);
- bytes += ALIGN (t->size, FFI_SIZEOF_ARG);
+ bytes = FFI_ALIGN (bytes, t->alignment);
+ bytes += FFI_ALIGN (t->size, FFI_SIZEOF_ARG);
}
- cif->bytes = ALIGN (bytes, 16);
+ cif->bytes = bytes;
return FFI_OK;
}
@@ -234,12 +244,25 @@ static const struct abi_params abi_params[FFI_LAST_ABI] = {
[FFI_MS_CDECL] = { 1, R_ECX, 0 }
};
-extern void ffi_call_i386(struct call_frame *, char *)
-#if HAVE_FASTCALL
- __declspec(fastcall)
+#ifdef HAVE_FASTCALL
+ #ifdef _MSC_VER
+ #define FFI_DECLARE_FASTCALL __fastcall
+ #else
+ #define FFI_DECLARE_FASTCALL __declspec(fastcall)
+ #endif
+#else
+ #define FFI_DECLARE_FASTCALL
#endif
- FFI_HIDDEN;
+extern void FFI_DECLARE_FASTCALL ffi_call_i386(struct call_frame *, char *) FFI_HIDDEN;
+
+/* We perform some black magic here to use some of the parent's stack frame in
+ * ffi_call_i386() that breaks with the MSVC compiler with the /RTCs or /GZ
+ * flags. Disable the 'Stack frame run time error checking' for this function
+ * so we don't hit weird exceptions in debug builds. */
+#if defined(_MSC_VER)
+#pragma runtime_checks("s", off)
+#endif
static void
ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
void **avalue, void *closure)
@@ -277,7 +300,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
}
}
- bytes = cif->bytes;
+ bytes = STACK_ALIGN (cif->bytes);
stack = alloca(bytes + sizeof(*frame) + rsize);
argp = (dir < 0 ? stack + bytes : stack);
frame = (struct call_frame *)(stack + bytes);
@@ -334,9 +357,18 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
}
else
{
- size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+ size_t za = FFI_ALIGN (z, FFI_SIZEOF_ARG);
size_t align = FFI_SIZEOF_ARG;
+ /* Issue 434: For thiscall and fastcall, if the paramter passed
+ as 64-bit integer or struct, all following integer parameters
+ will be passed on stack. */
+ if ((cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+ && (t == FFI_TYPE_SINT64
+ || t == FFI_TYPE_UINT64
+ || t == FFI_TYPE_STRUCT))
+ narg_reg = 2;
+
/* Alignment rules for arguments are quite complex. Vectors and
structures with 16 byte alignment get it. Note that long double
on Darwin does have 16 byte alignment, and does not get this
@@ -356,7 +388,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
}
else
{
- argp = (char *)ALIGN (argp, align);
+ argp = (char *)FFI_ALIGN (argp, align);
memcpy (argp, valp, z);
argp += za;
}
@@ -366,6 +398,9 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
ffi_call_i386 (frame, stack);
}
+#if defined(_MSC_VER)
+#pragma runtime_checks("s", restore)
+#endif
void
ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
@@ -373,18 +408,25 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
+#ifdef FFI_GO_CLOSURES
void
ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
void **avalue, void *closure)
{
ffi_call_int (cif, fn, rvalue, avalue, closure);
}
+#endif
/** private members **/
void FFI_HIDDEN ffi_closure_i386(void);
void FFI_HIDDEN ffi_closure_STDCALL(void);
void FFI_HIDDEN ffi_closure_REGISTER(void);
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void FFI_HIDDEN ffi_closure_i386_alt(void);
+void FFI_HIDDEN ffi_closure_STDCALL_alt(void);
+void FFI_HIDDEN ffi_closure_REGISTER_alt(void);
+#endif
struct closure_frame
{
@@ -395,10 +437,7 @@ struct closure_frame
void *user_data; /* 36 */
};
-int FFI_HIDDEN
-#if HAVE_FASTCALL
-__declspec(fastcall)
-#endif
+int FFI_HIDDEN FFI_DECLARE_FASTCALL
ffi_closure_inner (struct closure_frame *frame, char *stack)
{
ffi_cif *cif = frame->cif;
@@ -415,7 +454,7 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
rvalue = frame->rettemp;
pabi = &abi_params[cabi];
dir = pabi->dir;
- argp = (dir < 0 ? stack + cif->bytes : stack);
+ argp = (dir < 0 ? stack + STACK_ALIGN (cif->bytes) : stack);
switch (flags)
{
@@ -463,13 +502,22 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
}
else
{
- size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+ size_t za = FFI_ALIGN (z, FFI_SIZEOF_ARG);
size_t align = FFI_SIZEOF_ARG;
/* See the comment in ffi_call_int. */
if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
align = 16;
+ /* Issue 434: For thiscall and fastcall, if the paramter passed
+ as 64-bit integer or struct, all following integer parameters
+ will be passed on stack. */
+ if ((cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+ && (t == FFI_TYPE_SINT64
+ || t == FFI_TYPE_UINT64
+ || t == FFI_TYPE_STRUCT))
+ narg_reg = 2;
+
if (dir < 0)
{
/* ??? These reverse argument ABIs are probably too old
@@ -479,7 +527,7 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
}
else
{
- argp = (char *)ALIGN (argp, align);
+ argp = (char *)FFI_ALIGN (argp, align);
valp = argp;
argp += za;
}
@@ -490,10 +538,17 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
frame->fun (cif, rvalue, avalue, frame->user_data);
- if (cabi == FFI_STDCALL)
- return flags + (cif->bytes << X86_RET_POP_SHIFT);
- else
- return flags;
+ switch (cabi)
+ {
+ case FFI_STDCALL:
+ return flags | (cif->bytes << X86_RET_POP_SHIFT);
+ case FFI_THISCALL:
+ case FFI_FASTCALL:
+ return flags | ((cif->bytes - (narg_reg * FFI_SIZEOF_ARG))
+ << X86_RET_POP_SHIFT);
+ default:
+ return flags;
+ }
}
ffi_status
@@ -510,30 +565,51 @@ ffi_prep_closure_loc (ffi_closure* closure,
switch (cif->abi)
{
case FFI_SYSV:
- case FFI_THISCALL:
- case FFI_FASTCALL:
case FFI_MS_CDECL:
dest = ffi_closure_i386;
break;
case FFI_STDCALL:
+ case FFI_THISCALL:
+ case FFI_FASTCALL:
case FFI_PASCAL:
dest = ffi_closure_STDCALL;
break;
case FFI_REGISTER:
dest = ffi_closure_REGISTER;
op = 0x68; /* pushl imm */
+ break;
default:
return FFI_BAD_ABI;
}
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ if (ffi_tramp_is_present(closure))
+ {
+ /* Initialize the static trampoline's parameters. */
+ if (dest == ffi_closure_i386)
+ dest = ffi_closure_i386_alt;
+ else if (dest == ffi_closure_STDCALL)
+ dest = ffi_closure_STDCALL_alt;
+ else
+ dest = ffi_closure_REGISTER_alt;
+ ffi_tramp_set_parms (closure->ftramp, dest, closure);
+ goto out;
+ }
+#endif
+
+ /* Initialize the dynamic trampoline. */
+ /* endbr32. */
+ *(UINT32 *) tramp = 0xfb1e0ff3;
+
/* movl or pushl immediate. */
- tramp[0] = op;
- *(void **)(tramp + 1) = codeloc;
+ tramp[4] = op;
+ *(void **)(tramp + 5) = codeloc;
/* jmp dest */
- tramp[5] = 0xe9;
- *(unsigned *)(tramp + 6) = (unsigned)dest - ((unsigned)codeloc + 10);
+ tramp[9] = 0xe9;
+ *(unsigned *)(tramp + 10) = (unsigned)dest - ((unsigned)codeloc + 14);
+out:
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;
@@ -541,6 +617,8 @@ ffi_prep_closure_loc (ffi_closure* closure,
return FFI_OK;
}
+#ifdef FFI_GO_CLOSURES
+
void FFI_HIDDEN ffi_go_closure_EAX(void);
void FFI_HIDDEN ffi_go_closure_ECX(void);
void FFI_HIDDEN ffi_go_closure_STDCALL(void);
@@ -577,6 +655,8 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
return FFI_OK;
}
+#endif /* FFI_GO_CLOSURES */
+
/* ------- Native raw API support -------------------------------- */
#if !FFI_NO_RAW_API
@@ -669,8 +749,9 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
}
}
- bytes = cif->bytes;
- argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+ bytes = STACK_ALIGN (cif->bytes);
+ argp = stack =
+ (void *)((uintptr_t)alloca(bytes + sizeof(*frame) + rsize + 15) & ~16);
frame = (struct call_frame *)(stack + bytes);
if (rsize)
rvalue = frame + 1;
@@ -714,7 +795,7 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
else
{
memcpy (argp, avalue, z);
- z = ALIGN (z, FFI_SIZEOF_ARG);
+ z = FFI_ALIGN (z, FFI_SIZEOF_ARG);
argp += z;
}
avalue += z;
@@ -726,4 +807,17 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
ffi_call_i386 (frame, stack);
}
#endif /* !FFI_NO_RAW_API */
-#endif /* !__x86_64__ */
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void *
+ffi_tramp_arch (size_t *tramp_size, size_t *map_size)
+{
+ extern void *trampoline_code_table;
+
+ *map_size = X86_TRAMP_MAP_SIZE;
+ *tramp_size = X86_TRAMP_SIZE;
+ return &trampoline_code_table;
+}
+#endif
+
+#endif /* __i386__ */
diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index 243cbc7..438b374 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -1,6 +1,6 @@
/* -----------------------------------------------------------------------
- ffi64.c - Copyright (c) 2013 The Written Word, Inc.
- Copyright (c) 2011 Anthony Green
+ ffi64.c - Copyright (c) 2011, 2018 Anthony Green
+ Copyright (c) 2013 The Written Word, Inc.
Copyright (c) 2008, 2010 Red Hat, Inc.
Copyright (c) 2002, 2007 Bo Thorsen <bo@suse.de>
@@ -33,6 +33,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <stdint.h>
+#include <tramp.h>
#include "internal64.h"
#ifdef __x86_64__
@@ -217,10 +218,10 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
case FFI_TYPE_STRUCT:
{
const size_t UNITS_PER_WORD = 8;
- size_t words = (type->size + byte_offset + UNITS_PER_WORD - 1)
- / UNITS_PER_WORD;
+ size_t words = (type->size + byte_offset + UNITS_PER_WORD - 1)
+ / UNITS_PER_WORD;
ffi_type **ptr;
- int i;
+ unsigned int i;
enum x86_64_reg_class subclasses[MAX_CLASSES];
/* If the struct is larger than 32 bytes, pass it on the stack. */
@@ -244,14 +245,15 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
{
size_t num, pos;
- byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
+ byte_offset = FFI_ALIGN (byte_offset, (*ptr)->alignment);
num = classify_argument (*ptr, subclasses, byte_offset % 8);
if (num == 0)
return 0;
- pos = byte_offset / 8;
- for (i = 0; i < num && (i + pos) < words; i++)
+ pos = byte_offset / 8;
+ for (i = 0; i < num && (i + pos) < words; i++)
{
+ size_t pos = byte_offset / 8;
classes[i + pos] =
merge_classes (subclasses[i], classes[i + pos]);
}
@@ -283,7 +285,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
/* The X86_64_SSEUP_CLASS should be always preceded by
X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
- if (classes[i] == X86_64_SSEUP_CLASS
+ if (i > 1 && classes[i] == X86_64_SSEUP_CLASS
&& classes[i - 1] != X86_64_SSE_CLASS
&& classes[i - 1] != X86_64_SSEUP_CLASS)
{
@@ -294,7 +296,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
/* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
everything should be passed in memory. */
- if (classes[i] == X86_64_X87UP_CLASS
+ if (i > 1 && classes[i] == X86_64_X87UP_CLASS
&& (classes[i - 1] != X86_64_X87_CLASS))
{
/* The first one should never be X86_64_X87UP_CLASS. */
@@ -351,7 +353,8 @@ examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
_Bool in_return, int *pngpr, int *pnsse)
{
size_t n;
- int i, ngpr, nsse;
+ unsigned int i;
+ int ngpr, nsse;
n = classify_argument (type, classes, 0);
if (n == 0)
@@ -389,14 +392,24 @@ examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
/* Perform machine dependent cif processing. */
-ffi_status
+#ifndef __ILP32__
+extern ffi_status
+ffi_prep_cif_machdep_efi64(ffi_cif *cif);
+#endif
+
+ffi_status FFI_HIDDEN
ffi_prep_cif_machdep (ffi_cif *cif)
{
- int gprcount, ssecount, i, avn, ngpr, nsse, flags;
+ int gprcount, ssecount, i, avn, ngpr, nsse;
+ unsigned flags;
enum x86_64_reg_class classes[MAX_CLASSES];
size_t bytes, n, rtype_size;
ffi_type *rtype;
+#ifndef __ILP32__
+ if (cif->abi == FFI_EFI64 || cif->abi == FFI_GNUW64)
+ return ffi_prep_cif_machdep_efi64(cif);
+#endif
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
@@ -441,9 +454,11 @@ ffi_prep_cif_machdep (ffi_cif *cif)
case FFI_TYPE_DOUBLE:
flags = UNIX64_RET_XMM64;
break;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
case FFI_TYPE_LONGDOUBLE:
flags = UNIX64_RET_X87;
break;
+#endif
case FFI_TYPE_STRUCT:
n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
if (n == 0)
@@ -489,7 +504,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
case FFI_TYPE_SINT32:
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
- flags = UNIX64_RET_ST_RAX_RDX | (rtype_size << UNIX64_SIZE_SHIFT);
+ flags = UNIX64_RET_ST_RAX_RDX | ((unsigned) rtype_size << UNIX64_SIZE_SHIFT);
break;
case FFI_TYPE_FLOAT:
flags = UNIX64_RET_XMM64;
@@ -524,7 +539,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
if (align < 8)
align = 8;
- bytes = ALIGN (bytes, align);
+ bytes = FFI_ALIGN (bytes, align);
bytes += cif->arg_types[i]->size;
}
else
@@ -537,7 +552,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
flags |= UNIX64_FLAG_XMM_ARGS;
cif->flags = flags;
- cif->bytes = ALIGN (bytes, 8);
+ cif->bytes = (unsigned) FFI_ALIGN (bytes, 8);
return FFI_OK;
}
@@ -599,7 +614,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
align = 8;
/* Pass this argument in memory. */
- argp = (void *) ALIGN (argp, align);
+ argp = (void *) FFI_ALIGN (argp, align);
memcpy (argp, avalue[i], size);
argp += size;
}
@@ -607,7 +622,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
{
/* The argument is passed entirely in registers. */
char *a = (char *) avalue[i];
- int j;
+ unsigned int j;
for (j = 0; j < n; j++, a += 8, size -= 8)
{
@@ -641,10 +656,10 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
break;
case X86_64_SSE_CLASS:
case X86_64_SSEDF_CLASS:
- reg_args->sse[ssecount++].i64 = *(UINT64 *) a;
+ memcpy (&reg_args->sse[ssecount++].i64, a, sizeof(UINT64));
break;
case X86_64_SSESF_CLASS:
- reg_args->sse[ssecount++].i32 = *(UINT32 *) a;
+ memcpy (&reg_args->sse[ssecount++].i32, a, sizeof(UINT32));
break;
default:
abort();
@@ -658,21 +673,63 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
flags, rvalue, fn);
}
+#ifndef __ILP32__
+extern void
+ffi_call_efi64(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue);
+#endif
+
void
ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
{
+#ifndef __ILP32__
+ if (cif->abi == FFI_EFI64 || cif->abi == FFI_GNUW64)
+ {
+ ffi_call_efi64(cif, fn, rvalue, avalue);
+ return;
+ }
+#endif
ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
+#ifdef FFI_GO_CLOSURES
+
+#ifndef __ILP32__
+extern void
+ffi_call_go_efi64(ffi_cif *cif, void (*fn)(void), void *rvalue,
+ void **avalue, void *closure);
+#endif
+
void
ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
void **avalue, void *closure)
{
+#ifndef __ILP32__
+ if (cif->abi == FFI_EFI64 || cif->abi == FFI_GNUW64)
+ {
+ ffi_call_go_efi64(cif, fn, rvalue, avalue, closure);
+ return;
+ }
+#endif
ffi_call_int (cif, fn, rvalue, avalue, closure);
}
+#endif /* FFI_GO_CLOSURES */
+
extern void ffi_closure_unix64(void) FFI_HIDDEN;
extern void ffi_closure_unix64_sse(void) FFI_HIDDEN;
+#if defined(FFI_EXEC_STATIC_TRAMP)
+extern void ffi_closure_unix64_alt(void) FFI_HIDDEN;
+extern void ffi_closure_unix64_sse_alt(void) FFI_HIDDEN;
+#endif
+
+#ifndef __ILP32__
+extern ffi_status
+ffi_prep_closure_loc_efi64(ffi_closure* closure,
+ ffi_cif* cif,
+ void (*fun)(ffi_cif*, void*, void**, void*),
+ void *user_data,
+ void *codeloc);
+#endif
ffi_status
ffi_prep_closure_loc (ffi_closure* closure,
@@ -681,17 +738,23 @@ ffi_prep_closure_loc (ffi_closure* closure,
void *user_data,
void *codeloc)
{
- static const unsigned char trampoline[16] = {
- /* leaq -0x7(%rip),%r10 # 0x0 */
- 0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
- /* jmpq *0x3(%rip) # 0x10 */
- 0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
- /* nopl (%rax) */
- 0x0f, 0x1f, 0x00
+ static const unsigned char trampoline[24] = {
+ /* endbr64 */
+ 0xf3, 0x0f, 0x1e, 0xfa,
+ /* leaq -0xb(%rip),%r10 # 0x0 */
+ 0x4c, 0x8d, 0x15, 0xf5, 0xff, 0xff, 0xff,
+ /* jmpq *0x7(%rip) # 0x18 */
+ 0xff, 0x25, 0x07, 0x00, 0x00, 0x00,
+ /* nopl 0(%rax) */
+ 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
};
void (*dest)(void);
char *tramp = closure->tramp;
+#ifndef __ILP32__
+ if (cif->abi == FFI_EFI64 || cif->abi == FFI_GNUW64)
+ return ffi_prep_closure_loc_efi64(closure, cif, fun, user_data, codeloc);
+#endif
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
@@ -700,9 +763,24 @@ ffi_prep_closure_loc (ffi_closure* closure,
else
dest = ffi_closure_unix64;
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ if (ffi_tramp_is_present(closure))
+ {
+ /* Initialize the static trampoline's parameters. */
+ if (dest == ffi_closure_unix64_sse)
+ dest = ffi_closure_unix64_sse_alt;
+ else
+ dest = ffi_closure_unix64_alt;
+ ffi_tramp_set_parms (closure->ftramp, dest, closure);
+ goto out;
+ }
+#endif
+
+ /* Initialize the dynamic trampoline. */
memcpy (tramp, trampoline, sizeof(trampoline));
- *(UINT64 *)(tramp + 16) = (uintptr_t)dest;
+ *(UINT64 *)(tramp + sizeof (trampoline)) = (uintptr_t)dest;
+out:
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;
@@ -757,7 +835,7 @@ ffi_closure_unix64_inner(ffi_cif *cif,
align = 8;
/* Pass this argument in memory. */
- argp = (void *) ALIGN (argp, align);
+ argp = (void *) FFI_ALIGN (argp, align);
avalue[i] = argp;
argp += arg_types[i]->size;
}
@@ -783,7 +861,7 @@ ffi_closure_unix64_inner(ffi_cif *cif,
else
{
char *a = alloca (16);
- int j;
+ unsigned int j;
avalue[i] = a;
for (j = 0; j < n; j++, a += 8)
@@ -803,13 +881,25 @@ ffi_closure_unix64_inner(ffi_cif *cif,
return flags;
}
+#ifdef FFI_GO_CLOSURES
+
extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
extern void ffi_go_closure_unix64_sse(void) FFI_HIDDEN;
+#ifndef __ILP32__
+extern ffi_status
+ffi_prep_go_closure_efi64(ffi_go_closure* closure, ffi_cif* cif,
+ void (*fun)(ffi_cif*, void*, void**, void*));
+#endif
+
ffi_status
ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
void (*fun)(ffi_cif*, void*, void**, void*))
{
+#ifndef __ILP32__
+ if (cif->abi == FFI_EFI64 || cif->abi == FFI_GNUW64)
+ return ffi_prep_go_closure_efi64(closure, cif, fun);
+#endif
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
@@ -822,4 +912,18 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
return FFI_OK;
}
+#endif /* FFI_GO_CLOSURES */
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+void *
+ffi_tramp_arch (size_t *tramp_size, size_t *map_size)
+{
+ extern void *trampoline_code_table;
+
+ *map_size = UNIX64_TRAMP_MAP_SIZE;
+ *tramp_size = UNIX64_TRAMP_SIZE;
+ return &trampoline_code_table;
+}
+#endif
+
#endif /* __x86_64__ */
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index a576961..f454341 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -1,5 +1,5 @@
/* -----------------------------------------------------------------*-C-*-
- ffitarget.h - Copyright (c) 2012, 2014 Anthony Green
+ ffitarget.h - Copyright (c) 2012, 2014, 2018 Anthony Green
Copyright (c) 1996-2003, 2010 Red Hat, Inc.
Copyright (C) 2008 Free Software Foundation, Inc.
@@ -50,8 +50,7 @@
#endif
#define FFI_TARGET_SPECIFIC_STACK_SPACE_ALLOCATION
-
-#if !defined(_MSC_VER) && !defined(X86_DARWIN) && !defined(X86_64_DARWIN)
+#ifndef _MSC_VER
#define FFI_TARGET_HAS_COMPLEX_TYPE
#endif
@@ -81,13 +80,21 @@ typedef signed long ffi_sarg;
typedef enum ffi_abi {
#if defined(X86_WIN64)
FFI_FIRST_ABI = 0,
- FFI_WIN64,
+ FFI_WIN64, /* sizeof(long double) == 8 - microsoft compilers */
+ FFI_GNUW64, /* sizeof(long double) == 16 - GNU compilers */
FFI_LAST_ABI,
+#ifdef __GNUC__
+ FFI_DEFAULT_ABI = FFI_GNUW64
+#else
FFI_DEFAULT_ABI = FFI_WIN64
+#endif
-#elif defined(X86_64) || defined(X86_64_DARWIN)
+#elif defined(X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
FFI_FIRST_ABI = 1,
FFI_UNIX64,
+ FFI_WIN64,
+ FFI_EFI64 = FFI_WIN64,
+ FFI_GNUW64,
FFI_LAST_ABI,
FFI_DEFAULT_ABI = FFI_UNIX64
@@ -120,23 +127,36 @@ typedef enum ffi_abi {
/* ---- Definitions for closures ----------------------------------------- */
#define FFI_CLOSURES 1
-
-#if !defined(X86_DARWIN) && !defined(X86_64_DARWIN)
#define FFI_GO_CLOSURES 1
-#endif
#define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
#define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
#define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
#define FFI_TYPE_MS_STRUCT (FFI_TYPE_LAST + 4)
-#if defined (X86_64) || defined(X86_WIN64) || defined(X86_64_DARWIN)
-# define FFI_TRAMPOLINE_SIZE 24
+#if defined (X86_64) || defined(X86_WIN64) \
+ || (defined (__x86_64__) && defined (X86_DARWIN))
+/* 4 bytes of ENDBR64 + 7 bytes of LEA + 6 bytes of JMP + 7 bytes of NOP
+ + 8 bytes of pointer. */
+# define FFI_TRAMPOLINE_SIZE 32
# define FFI_NATIVE_RAW_API 0
#else
-# define FFI_TRAMPOLINE_SIZE 12
+/* 4 bytes of ENDBR32 + 5 bytes of MOV + 5 bytes of JMP + 2 unused
+ bytes. */
+# define FFI_TRAMPOLINE_SIZE 16
# define FFI_NATIVE_RAW_API 1 /* x86 has native raw api support */
#endif
+#if !defined(GENERATE_LIBFFI_MAP) && defined(__CET__)
+# include <cet.h>
+# if (__CET__ & 1) != 0
+# define ENDBR_PRESENT
+# endif
+# define _CET_NOTRACK notrack
+#else
+# define _CET_ENDBR
+# define _CET_NOTRACK
+#endif
+
#endif
diff --git a/libffi/src/x86/ffiw64.c b/libffi/src/x86/ffiw64.c
index 8a33a6c..6870d07 100644
--- a/libffi/src/x86/ffiw64.c
+++ b/libffi/src/x86/ffiw64.c
@@ -1,5 +1,6 @@
/* -----------------------------------------------------------------------
- ffiw64.c - Copyright (c) 2014 Red Hat, Inc.
+ ffiw64.c - Copyright (c) 2018 Anthony Green
+ Copyright (c) 2014 Red Hat, Inc.
x86 win64 Foreign Function Interface
@@ -24,12 +25,18 @@
DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------- */
+#if defined(__x86_64__) || defined(_M_AMD64)
#include <ffi.h>
#include <ffi_common.h>
#include <stdlib.h>
#include <stdint.h>
+#include <tramp.h>
#ifdef X86_WIN64
+#define EFI64(name) name
+#else
+#define EFI64(name) FFI_HIDDEN name##_efi64
+#endif
struct win64_call_frame
{
@@ -43,13 +50,19 @@ struct win64_call_frame
extern void ffi_call_win64 (void *stack, struct win64_call_frame *,
void *closure) FFI_HIDDEN;
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
+ffi_status FFI_HIDDEN
+EFI64(ffi_prep_cif_machdep)(ffi_cif *cif)
{
int flags, n;
- if (cif->abi != FFI_WIN64)
- return FFI_BAD_ABI;
+ switch (cif->abi)
+ {
+ case FFI_WIN64:
+ case FFI_GNUW64:
+ break;
+ default:
+ return FFI_BAD_ABI;
+ }
flags = cif->rtype->type;
switch (flags)
@@ -57,7 +70,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
default:
break;
case FFI_TYPE_LONGDOUBLE:
- flags = FFI_TYPE_STRUCT;
+ /* GCC returns long double values by reference, like a struct */
+ if (cif->abi == FFI_GNUW64)
+ flags = FFI_TYPE_STRUCT;
break;
case FFI_TYPE_COMPLEX:
flags = FFI_TYPE_STRUCT;
@@ -93,6 +108,13 @@ ffi_prep_cif_machdep (ffi_cif *cif)
return FFI_OK;
}
+/* We perform some black magic here to use some of the parent's stack frame in
+ * ffi_call_win64() that breaks with the MSVC compiler with the /RTCs or /GZ
+ * flags. Disable the 'Stack frame run time error checking' for this function
+ * so we don't hit weird exceptions in debug builds. */
+#if defined(_MSC_VER)
+#pragma runtime_checks("s", off)
+#endif
static void
ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
void **avalue, void *closure)
@@ -102,7 +124,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
size_t rsize;
struct win64_call_frame *frame;
- FFI_ASSERT(cif->abi == FFI_WIN64);
+ FFI_ASSERT(cif->abi == FFI_GNUW64 || cif->abi == FFI_WIN64);
flags = cif->flags;
rsize = 0;
@@ -157,15 +179,18 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
ffi_call_win64 (stack, frame, closure);
}
+#if defined(_MSC_VER)
+#pragma runtime_checks("s", restore)
+#endif
void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+EFI64(ffi_call)(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
{
ffi_call_int (cif, fn, rvalue, avalue, NULL);
}
void
-ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+EFI64(ffi_call_go)(ffi_cif *cif, void (*fn)(void), void *rvalue,
void **avalue, void *closure)
{
ffi_call_int (cif, fn, rvalue, avalue, closure);
@@ -173,31 +198,56 @@ ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
extern void ffi_closure_win64(void) FFI_HIDDEN;
+#if defined(FFI_EXEC_STATIC_TRAMP)
+extern void ffi_closure_win64_alt(void) FFI_HIDDEN;
+#endif
+
+#ifdef FFI_GO_CLOSURES
extern void ffi_go_closure_win64(void) FFI_HIDDEN;
+#endif
ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
+EFI64(ffi_prep_closure_loc)(ffi_closure* closure,
ffi_cif* cif,
void (*fun)(ffi_cif*, void*, void**, void*),
void *user_data,
void *codeloc)
{
- static const unsigned char trampoline[16] = {
- /* leaq -0x7(%rip),%r10 # 0x0 */
- 0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
- /* jmpq *0x3(%rip) # 0x10 */
- 0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
- /* nopl (%rax) */
- 0x0f, 0x1f, 0x00
+ static const unsigned char trampoline[FFI_TRAMPOLINE_SIZE - 8] = {
+ /* endbr64 */
+ 0xf3, 0x0f, 0x1e, 0xfa,
+ /* leaq -0xb(%rip),%r10 # 0x0 */
+ 0x4c, 0x8d, 0x15, 0xf5, 0xff, 0xff, 0xff,
+ /* jmpq *0x7(%rip) # 0x18 */
+ 0xff, 0x25, 0x07, 0x00, 0x00, 0x00,
+ /* nopl 0(%rax) */
+ 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
};
- unsigned char *tramp = closure->tramp;
+ char *tramp = closure->tramp;
+
+ switch (cif->abi)
+ {
+ case FFI_WIN64:
+ case FFI_GNUW64:
+ break;
+ default:
+ return FFI_BAD_ABI;
+ }
- if (cif->abi != FFI_WIN64)
- return FFI_BAD_ABI;
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ if (ffi_tramp_is_present(closure))
+ {
+ /* Initialize the static trampoline's parameters. */
+ ffi_tramp_set_parms (closure->ftramp, ffi_closure_win64_alt, closure);
+ goto out;
+ }
+#endif
+ /* Initialize the dynamic trampoline. */
memcpy (tramp, trampoline, sizeof(trampoline));
- *(UINT64 *)(tramp + 16) = (uintptr_t)ffi_closure_win64;
+ *(UINT64 *)(tramp + sizeof (trampoline)) = (uintptr_t)ffi_closure_win64;
+out:
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;
@@ -205,12 +255,19 @@ ffi_prep_closure_loc (ffi_closure* closure,
return FFI_OK;
}
+#ifdef FFI_GO_CLOSURES
ffi_status
-ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+EFI64(ffi_prep_go_closure)(ffi_go_closure* closure, ffi_cif* cif,
void (*fun)(ffi_cif*, void*, void**, void*))
{
- if (cif->abi != FFI_WIN64)
- return FFI_BAD_ABI;
+ switch (cif->abi)
+ {
+ case FFI_WIN64:
+ case FFI_GNUW64:
+ break;
+ default:
+ return FFI_BAD_ABI;
+ }
closure->tramp = ffi_go_closure_win64;
closure->cif = cif;
@@ -218,6 +275,7 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
return FFI_OK;
}
+#endif
struct win64_closure_frame
{
@@ -227,7 +285,11 @@ struct win64_closure_frame
UINT64 args[];
};
-int FFI_HIDDEN
+/* Force the inner function to use the MS ABI. When compiling on win64
+ this is a nop. When compiling on unix, this simplifies the assembly,
+ and places the burden of saving the extra call-saved registers on
+ the compiler. */
+int FFI_HIDDEN __attribute__((ms_abi))
ffi_closure_win64_inner(ffi_cif *cif,
void (*fun)(ffi_cif*, void*, void**, void*),
void *user_data,
@@ -278,4 +340,4 @@ ffi_closure_win64_inner(ffi_cif *cif,
return flags;
}
-#endif /* X86_WIN64 */
+#endif /* __x86_64__ */
diff --git a/libffi/src/x86/internal.h b/libffi/src/x86/internal.h
index 09771ba..23be7a2 100644
--- a/libffi/src/x86/internal.h
+++ b/libffi/src/x86/internal.h
@@ -27,3 +27,17 @@
#else
# define HAVE_FASTCALL 1
#endif
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+/*
+ * For the trampoline code table mapping, a mapping size of 4K (base page size)
+ * is chosen.
+ */
+#define X86_TRAMP_MAP_SHIFT 12
+#define X86_TRAMP_MAP_SIZE (1 << X86_TRAMP_MAP_SHIFT)
+#ifdef ENDBR_PRESENT
+#define X86_TRAMP_SIZE 44
+#else
+#define X86_TRAMP_SIZE 40
+#endif
+#endif
diff --git a/libffi/src/x86/internal64.h b/libffi/src/x86/internal64.h
index 512e955..282b408 100644
--- a/libffi/src/x86/internal64.h
+++ b/libffi/src/x86/internal64.h
@@ -20,3 +20,17 @@
#define UNIX64_FLAG_RET_IN_MEM (1 << 10)
#define UNIX64_FLAG_XMM_ARGS (1 << 11)
#define UNIX64_SIZE_SHIFT 12
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+/*
+ * For the trampoline code table mapping, a mapping size of 4K (base page size)
+ * is chosen.
+ */
+#define UNIX64_TRAMP_MAP_SHIFT 12
+#define UNIX64_TRAMP_MAP_SIZE (1 << UNIX64_TRAMP_MAP_SHIFT)
+#ifdef ENDBR_PRESENT
+#define UNIX64_TRAMP_SIZE 40
+#else
+#define UNIX64_TRAMP_SIZE 32
+#endif
+#endif
diff --git a/libffi/src/x86/sysv.S b/libffi/src/x86/sysv.S
index 78f245b..7110f02 100644
--- a/libffi/src/x86/sysv.S
+++ b/libffi/src/x86/sysv.S
@@ -1,6 +1,7 @@
/* -----------------------------------------------------------------------
- sysv.S - Copyright (c) 2013 The Written Word, Inc.
- - Copyright (c) 1996,1998,2001-2003,2005,2008,2010 Red Hat, Inc.
+ sysv.S - Copyright (c) 2017 Anthony Green
+ - Copyright (c) 2013 The Written Word, Inc.
+ - Copyright (c) 1996,1998,2001-2003,2005,2008,2010 Red Hat, Inc.
X86 Foreign Function Interface
@@ -25,7 +26,8 @@
DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------- */
-#ifndef __x86_64__
+#ifdef __i386__
+#ifndef _MSC_VER
#define LIBFFI_ASM
#include <fficonfig.h>
@@ -54,8 +56,8 @@
/* Handle win32 fastcall name mangling. */
#ifdef X86_WIN32
-# define ffi_call_i386 @ffi_call_i386@8
-# define ffi_closure_inner @ffi_closure_inner@8
+# define ffi_call_i386 "@ffi_call_i386@8"
+# define ffi_closure_inner "@ffi_closure_inner@8"
#else
# define ffi_call_i386 C(ffi_call_i386)
# define ffi_closure_inner C(ffi_closure_inner)
@@ -90,6 +92,7 @@
ffi_call_i386:
L(UW0):
# cfi_startproc
+ _CET_ENDBR
#if !HAVE_FASTCALL
movl 4(%esp), %ecx
movl 8(%esp), %edx
@@ -131,7 +134,7 @@ L(pc1):
leal L(store_table)(,%ecx, 8), %ebx
#endif
movl 16(%ebp), %ecx /* load result address */
- jmp *%ebx
+ _CET_NOTRACK jmp *%ebx
.balign 8
L(store_table):
@@ -254,7 +257,7 @@ ENDF(ffi_call_i386)
andl $X86_RET_TYPE_MASK, %eax; \
leal L(C1(load_table,N))(, %eax, 8), %edx; \
movl closure_CF(%esp), %eax; /* optimiztic load */ \
- jmp *%edx
+ _CET_NOTRACK jmp *%edx
#ifdef __PIC__
# if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
@@ -265,14 +268,14 @@ ENDF(ffi_call_i386)
L(C1(pc,N)): \
leal L(C1(load_table,N))-L(C1(pc,N))(%edx, %eax, 8), %edx; \
movl closure_CF(%esp), %eax; /* optimiztic load */ \
- jmp *%edx
+ _CET_NOTRACK jmp *%edx
# else
# define FFI_CLOSURE_CALL_INNER_SAVE_EBX
# undef FFI_CLOSURE_CALL_INNER
# define FFI_CLOSURE_CALL_INNER(UWN) \
movl %ebx, 40(%esp); /* save ebx */ \
L(C1(UW,UWN)): \
- # cfi_rel_offset(%ebx, 40); \
+ /* cfi_rel_offset(%ebx, 40); */ \
call C(__x86.get_pc_thunk.bx); /* load got register */ \
addl $C(_GLOBAL_OFFSET_TABLE_), %ebx; \
call ffi_closure_inner@PLT
@@ -282,9 +285,9 @@ L(C1(UW,UWN)): \
leal L(C1(load_table,N))@GOTOFF(%ebx, %eax, 8), %edx; \
movl 40(%esp), %ebx; /* restore ebx */ \
L(C1(UW,UWN)): \
- # cfi_restore(%ebx); \
+ /* cfi_restore(%ebx); */ \
movl closure_CF(%esp), %eax; /* optimiztic load */ \
- jmp *%edx
+ _CET_NOTRACK jmp *%edx
# endif /* DARWIN || HIDDEN */
#endif /* __PIC__ */
@@ -294,6 +297,7 @@ L(C1(UW,UWN)): \
C(ffi_go_closure_EAX):
L(UW6):
# cfi_startproc
+ _CET_ENDBR
subl $closure_FS, %esp
L(UW7):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -314,6 +318,7 @@ ENDF(C(ffi_go_closure_EAX))
C(ffi_go_closure_ECX):
L(UW9):
# cfi_startproc
+ _CET_ENDBR
subl $closure_FS, %esp
L(UW10):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -338,6 +343,7 @@ ENDF(C(ffi_go_closure_ECX))
C(ffi_closure_i386):
L(UW12):
# cfi_startproc
+ _CET_ENDBR
subl $closure_FS, %esp
L(UW13):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -421,6 +427,7 @@ ENDF(C(ffi_closure_i386))
C(ffi_go_closure_STDCALL):
L(UW21):
# cfi_startproc
+ _CET_ENDBR
subl $closure_FS, %esp
L(UW22):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -446,6 +453,7 @@ L(UW24):
# cfi_startproc
# cfi_def_cfa(%esp, 8)
# cfi_offset(%eip, -8)
+ _CET_ENDBR
subl $closure_FS-4, %esp
L(UW25):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -468,6 +476,7 @@ ENDF(C(ffi_closure_REGISTER))
C(ffi_closure_STDCALL):
L(UW27):
# cfi_startproc
+ _CET_ENDBR
subl $closure_FS, %esp
L(UW28):
# cfi_def_cfa_offset(closure_FS + 4)
@@ -564,6 +573,94 @@ L(UW31):
# cfi_endproc
ENDF(C(ffi_closure_STDCALL))
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ .balign 16
+ .globl C(ffi_closure_i386_alt)
+ FFI_HIDDEN(C(ffi_closure_i386_alt))
+C(ffi_closure_i386_alt):
+ /* See the comments above trampoline_code_table. */
+ _CET_ENDBR
+ movl 4(%esp), %eax /* Load closure in eax */
+ add $8, %esp /* Restore the stack */
+ jmp C(ffi_closure_i386)
+ENDF(C(ffi_closure_i386_alt))
+
+ .balign 16
+ .globl C(ffi_closure_REGISTER_alt)
+ FFI_HIDDEN(C(ffi_closure_REGISTER_alt))
+C(ffi_closure_REGISTER_alt):
+ /* See the comments above trampoline_code_table. */
+ _CET_ENDBR
+ movl (%esp), %eax /* Restore eax */
+ add $4, %esp /* Leave closure on stack */
+ jmp C(ffi_closure_REGISTER)
+ENDF(C(ffi_closure_REGISTER_alt))
+
+ .balign 16
+ .globl C(ffi_closure_STDCALL_alt)
+ FFI_HIDDEN(C(ffi_closure_STDCALL_alt))
+C(ffi_closure_STDCALL_alt):
+ /* See the comments above trampoline_code_table. */
+ _CET_ENDBR
+ movl 4(%esp), %eax /* Load closure in eax */
+ add $8, %esp /* Restore the stack */
+ jmp C(ffi_closure_STDCALL)
+ENDF(C(ffi_closure_STDCALL_alt))
+
+/*
+ * Below is the definition of the trampoline code table. Each element in
+ * the code table is a trampoline.
+ *
+ * Because we jump to the trampoline, we place a _CET_ENDBR at the
+ * beginning of the trampoline to mark it as a valid branch target. This is
+ * part of the Intel CET (Control Flow Enforcement Technology).
+ */
+/*
+ * The trampoline uses register eax. It saves the original value of eax on
+ * the stack.
+ *
+ * The trampoline has two parameters - target code to jump to and data for
+ * the target code. The trampoline extracts the parameters from its parameter
+ * block (see tramp_table_map()). The trampoline saves the data address on
+ * the stack. Finally, it jumps to the target code.
+ *
+ * The target code can choose to:
+ *
+ * - restore the value of eax
+ * - load the data address in a register
+ * - restore the stack pointer to what it was when the trampoline was invoked.
+ */
+#ifdef ENDBR_PRESENT
+#define X86_DATA_OFFSET 4081
+#define X86_CODE_OFFSET 4070
+#else
+#define X86_DATA_OFFSET 4085
+#define X86_CODE_OFFSET 4074
+#endif
+
+ .align X86_TRAMP_MAP_SIZE
+ .globl C(trampoline_code_table)
+ FFI_HIDDEN(C(trampoline_code_table))
+C(trampoline_code_table):
+ .rept X86_TRAMP_MAP_SIZE / X86_TRAMP_SIZE
+ _CET_ENDBR
+ sub $8, %esp
+ movl %eax, (%esp) /* Save %eax on stack */
+ call 1f /* Get next PC into %eax */
+ movl X86_DATA_OFFSET(%eax), %eax /* Copy data into %eax */
+ movl %eax, 4(%esp) /* Save data on stack */
+ call 1f /* Get next PC into %eax */
+ movl X86_CODE_OFFSET(%eax), %eax /* Copy code into %eax */
+ jmp *%eax /* Jump to code */
+1:
+ mov (%esp), %eax
+ ret
+ .align 4
+ .endr
+ENDF(C(trampoline_code_table))
+ .align X86_TRAMP_MAP_SIZE
+#endif /* FFI_EXEC_STATIC_TRAMP */
+
#if !FFI_NO_RAW_API
#define raw_closure_S_FS (16+16+12)
@@ -574,6 +671,7 @@ ENDF(C(ffi_closure_STDCALL))
C(ffi_closure_raw_SYSV):
L(UW32):
# cfi_startproc
+ _CET_ENDBR
subl $raw_closure_S_FS, %esp
L(UW33):
# cfi_def_cfa_offset(raw_closure_S_FS + 4)
@@ -677,6 +775,7 @@ ENDF(C(ffi_closure_raw_SYSV))
C(ffi_closure_raw_THISCALL):
L(UW41):
# cfi_startproc
+ _CET_ENDBR
/* Rearrange the stack such that %ecx is the first argument.
This means moving the return address. */
popl %edx
@@ -790,9 +889,9 @@ ENDF(C(ffi_closure_raw_THISCALL))
#ifdef X86_DARWIN
# define COMDAT(X) \
- .section __TEXT,__textcoal_nt,coalesced,pure_instructions; \
+ .section __TEXT,__text,coalesced,pure_instructions; \
.weak_definition X; \
- .private_extern X
+ FFI_HIDDEN(X)
#elif defined __ELF__ && !(defined(__sun__) && defined(__svr4__))
# define COMDAT(X) \
.section .text.X,"axG",@progbits,X,comdat; \
@@ -1033,7 +1132,95 @@ L(SFDE9):
L(EFDE9):
#endif /* !FFI_NO_RAW_API */
-#endif /* ifndef __x86_64__ */
+#ifdef _WIN32
+ .def @feat.00;
+ .scl 3;
+ .type 0;
+ .endef
+ .globl @feat.00
+@feat.00 = 1
+#endif
+
+#ifdef __APPLE__
+ .subsections_via_symbols
+ .section __LD,__compact_unwind,regular,debug
+
+ /* compact unwind for ffi_call_i386 */
+ .long C(ffi_call_i386)
+ .set L1,L(UW5)-L(UW0)
+ .long L1
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_go_closure_EAX */
+ .long C(ffi_go_closure_EAX)
+ .set L2,L(UW8)-L(UW6)
+ .long L2
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_go_closure_ECX */
+ .long C(ffi_go_closure_ECX)
+ .set L3,L(UW11)-L(UW9)
+ .long L3
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_closure_i386 */
+ .long C(ffi_closure_i386)
+ .set L4,L(UW20)-L(UW12)
+ .long L4
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_go_closure_STDCALL */
+ .long C(ffi_go_closure_STDCALL)
+ .set L5,L(UW23)-L(UW21)
+ .long L5
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_closure_REGISTER */
+ .long C(ffi_closure_REGISTER)
+ .set L6,L(UW26)-L(UW24)
+ .long L6
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_closure_STDCALL */
+ .long C(ffi_closure_STDCALL)
+ .set L7,L(UW31)-L(UW27)
+ .long L7
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_closure_raw_SYSV */
+ .long C(ffi_closure_raw_SYSV)
+ .set L8,L(UW40)-L(UW32)
+ .long L8
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+
+ /* compact unwind for ffi_closure_raw_THISCALL */
+ .long C(ffi_closure_raw_THISCALL)
+ .set L9,L(UW52)-L(UW41)
+ .long L9
+ .long 0x04000000 /* use dwarf unwind info */
+ .long 0
+ .long 0
+#endif /* __APPLE__ */
+
+#endif /* ifndef _MSC_VER */
+
+#endif /* ifdef __i386__ */
#if defined __ELF__ && defined __linux__
.section .note.GNU-stack,"",@progbits
diff --git a/libffi/src/x86/sysv_intel.S b/libffi/src/x86/sysv_intel.S
new file mode 100644
index 0000000..3cafd71
--- /dev/null
+++ b/libffi/src/x86/sysv_intel.S
@@ -0,0 +1,995 @@
+/* -----------------------------------------------------------------------
+ sysv.S - Copyright (c) 2017 Anthony Green
+ - Copyright (c) 2013 The Written Word, Inc.
+ - Copyright (c) 1996,1998,2001-2003,2005,2008,2010 Red Hat, Inc.
+
+ X86 Foreign Function Interface
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ ``Software''), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
+ ----------------------------------------------------------------------- */
+
+#ifndef __x86_64__
+#ifdef _MSC_VER
+
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+#include <ffi_cfi.h>
+#include "internal.h"
+
+#define C2(X, Y) X ## Y
+#define C1(X, Y) C2(X, Y)
+#define L(X) C1(L, X)
+# define ENDF(X) X ENDP
+
+/* This macro allows the safe creation of jump tables without an
+ actual table. The entry points into the table are all 8 bytes.
+ The use of ORG asserts that we're at the correct location. */
+/* ??? The clang assembler doesn't handle .org with symbolic expressions. */
+#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
+# define E(BASE, X) ALIGN 8
+#else
+# define E(BASE, X) ALIGN 8; ORG BASE + X * 8
+#endif
+
+ .686P
+ .MODEL FLAT
+
+EXTRN @ffi_closure_inner@8:PROC
+_TEXT SEGMENT
+
+/* This is declared as
+
+ void ffi_call_i386(struct call_frame *frame, char *argp)
+ __attribute__((fastcall));
+
+ Thus the arguments are present in
+
+ ecx: frame
+ edx: argp
+*/
+
+ALIGN 16
+PUBLIC @ffi_call_i386@8
+@ffi_call_i386@8 PROC
+L(UW0):
+ cfi_startproc
+ #if !HAVE_FASTCALL
+ mov ecx, [esp+4]
+ mov edx, [esp+8]
+ #endif
+ mov eax, [esp] /* move the return address */
+ mov [ecx], ebp /* store ebp into local frame */
+ mov [ecx+4], eax /* store retaddr into local frame */
+
+ /* New stack frame based off ebp. This is a itty bit of unwind
+ trickery in that the CFA *has* changed. There is no easy way
+ to describe it correctly on entry to the function. Fortunately,
+ it doesn't matter too much since at all points we can correctly
+ unwind back to ffi_call. Note that the location to which we
+ moved the return address is (the new) CFA-4, so from the
+ perspective of the unwind info, it hasn't moved. */
+ mov ebp, ecx
+L(UW1):
+ // cfi_def_cfa(%ebp, 8)
+ // cfi_rel_offset(%ebp, 0)
+
+ mov esp, edx /* set outgoing argument stack */
+ mov eax, [20+R_EAX*4+ebp] /* set register arguments */
+ mov edx, [20+R_EDX*4+ebp]
+ mov ecx, [20+R_ECX*4+ebp]
+
+ call dword ptr [ebp+8]
+
+ mov ecx, [12+ebp] /* load return type code */
+ mov [ebp+8], ebx /* preserve %ebx */
+L(UW2):
+ // cfi_rel_offset(%ebx, 8)
+
+ and ecx, X86_RET_TYPE_MASK
+ lea ebx, [L(store_table) + ecx * 8]
+ mov ecx, [ebp+16] /* load result address */
+ jmp ebx
+
+ ALIGN 8
+L(store_table):
+E(L(store_table), X86_RET_FLOAT)
+ fstp DWORD PTR [ecx]
+ jmp L(e1)
+E(L(store_table), X86_RET_DOUBLE)
+ fstp QWORD PTR [ecx]
+ jmp L(e1)
+E(L(store_table), X86_RET_LDOUBLE)
+ fstp QWORD PTR [ecx]
+ jmp L(e1)
+E(L(store_table), X86_RET_SINT8)
+ movsx eax, al
+ mov [ecx], eax
+ jmp L(e1)
+E(L(store_table), X86_RET_SINT16)
+ movsx eax, ax
+ mov [ecx], eax
+ jmp L(e1)
+E(L(store_table), X86_RET_UINT8)
+ movzx eax, al
+ mov [ecx], eax
+ jmp L(e1)
+E(L(store_table), X86_RET_UINT16)
+ movzx eax, ax
+ mov [ecx], eax
+ jmp L(e1)
+E(L(store_table), X86_RET_INT64)
+ mov [ecx+4], edx
+ /* fallthru */
+E(L(store_table), X86_RET_INT32)
+ mov [ecx], eax
+ /* fallthru */
+E(L(store_table), X86_RET_VOID)
+L(e1):
+ mov ebx, [ebp+8]
+ mov esp, ebp
+ pop ebp
+L(UW3):
+ // cfi_remember_state
+ // cfi_def_cfa(%esp, 4)
+ // cfi_restore(%ebx)
+ // cfi_restore(%ebp)
+ ret
+L(UW4):
+ // cfi_restore_state
+
+E(L(store_table), X86_RET_STRUCTPOP)
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCTARG)
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCT_1B)
+ mov [ecx], al
+ jmp L(e1)
+E(L(store_table), X86_RET_STRUCT_2B)
+ mov [ecx], ax
+ jmp L(e1)
+
+ /* Fill out the table so that bad values are predictable. */
+E(L(store_table), X86_RET_UNUSED14)
+ int 3
+E(L(store_table), X86_RET_UNUSED15)
+ int 3
+
+L(UW5):
+ // cfi_endproc
+ENDF(@ffi_call_i386@8)
+
+/* The inner helper is declared as
+
+ void ffi_closure_inner(struct closure_frame *frame, char *argp)
+   __attribute__((fastcall))
+
+ Thus the arguments are placed in
+
+ ecx: frame
+ edx: argp
+*/
+
+/* Macros to help setting up the closure_data structure. */
+
+#if HAVE_FASTCALL
+# define closure_FS (40 + 4)
+# define closure_CF 0
+#else
+# define closure_FS (8 + 40 + 12)
+# define closure_CF 8
+#endif
+
+FFI_CLOSURE_SAVE_REGS MACRO
+ mov [esp + closure_CF+16+R_EAX*4], eax
+ mov [esp + closure_CF+16+R_EDX*4], edx
+ mov [esp + closure_CF+16+R_ECX*4], ecx
+ENDM
+
+FFI_CLOSURE_COPY_TRAMP_DATA MACRO
+ mov edx, [eax+FFI_TRAMPOLINE_SIZE] /* copy cif */
+ mov ecx, [eax+FFI_TRAMPOLINE_SIZE+4] /* copy fun */
+ mov eax, [eax+FFI_TRAMPOLINE_SIZE+8]; /* copy user_data */
+ mov [esp+closure_CF+28], edx
+ mov [esp+closure_CF+32], ecx
+ mov [esp+closure_CF+36], eax
+ENDM
+
+#if HAVE_FASTCALL
+FFI_CLOSURE_PREP_CALL MACRO
+ mov ecx, esp /* load closure_data */
+ lea edx, [esp+closure_FS+4] /* load incoming stack */
+ENDM
+#else
+FFI_CLOSURE_PREP_CALL MACRO
+ lea ecx, [esp+closure_CF] /* load closure_data */
+ lea edx, [esp+closure_FS+4] /* load incoming stack */
+ mov [esp], ecx
+ mov [esp+4], edx
+ENDM
+#endif
+
+FFI_CLOSURE_CALL_INNER MACRO UWN
+ call @ffi_closure_inner@8
+ENDM
+
+FFI_CLOSURE_MASK_AND_JUMP MACRO LABEL
+ and eax, X86_RET_TYPE_MASK
+ lea edx, [LABEL+eax*8]
+	mov	eax, [esp+closure_CF]	/* optimistic load */
+ jmp edx
+ENDM
+
+ALIGN 16
+PUBLIC ffi_go_closure_EAX
+ffi_go_closure_EAX PROC C
+L(UW6):
+ // cfi_startproc
+ sub esp, closure_FS
+L(UW7):
+ // cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ mov edx, [eax+4] /* copy cif */
+ mov ecx, [eax +8] /* copy fun */
+ mov [esp+closure_CF+28], edx
+ mov [esp+closure_CF+32], ecx
+ mov [esp+closure_CF+36], eax /* closure is user_data */
+ jmp L(do_closure_i386)
+L(UW8):
+ // cfi_endproc
+ENDF(ffi_go_closure_EAX)
+
+ALIGN 16
+PUBLIC ffi_go_closure_ECX
+ffi_go_closure_ECX PROC C
+L(UW9):
+ // cfi_startproc
+ sub esp, closure_FS
+L(UW10):
+ // cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ mov edx, [ecx+4] /* copy cif */
+ mov eax, [ecx+8] /* copy fun */
+ mov [esp+closure_CF+28], edx
+ mov [esp+closure_CF+32], eax
+ mov [esp+closure_CF+36], ecx /* closure is user_data */
+ jmp L(do_closure_i386)
+L(UW11):
+ // cfi_endproc
+ENDF(ffi_go_closure_ECX)
+
+/* The closure entry points are reached from the ffi_closure trampoline.
+ On entry, %eax contains the address of the ffi_closure. */
+
+ALIGN 16
+PUBLIC ffi_closure_i386
+ffi_closure_i386 PROC C
+L(UW12):
+ // cfi_startproc
+ sub esp, closure_FS
+L(UW13):
+ // cfi_def_cfa_offset(closure_FS + 4)
+
+ FFI_CLOSURE_SAVE_REGS
+ FFI_CLOSURE_COPY_TRAMP_DATA
+
+	/* Entry point from preceding Go closures. */
+L(do_closure_i386)::
+
+ FFI_CLOSURE_PREP_CALL
+ FFI_CLOSURE_CALL_INNER(14)
+ FFI_CLOSURE_MASK_AND_JUMP L(C1(load_table,2))
+
+ ALIGN 8
+L(load_table2):
+E(L(load_table2), X86_RET_FLOAT)
+ fld dword ptr [esp+closure_CF]
+ jmp L(e2)
+E(L(load_table2), X86_RET_DOUBLE)
+ fld qword ptr [esp+closure_CF]
+ jmp L(e2)
+E(L(load_table2), X86_RET_LDOUBLE)
+ fld qword ptr [esp+closure_CF]
+ jmp L(e2)
+E(L(load_table2), X86_RET_SINT8)
+ movsx eax, al
+ jmp L(e2)
+E(L(load_table2), X86_RET_SINT16)
+ movsx eax, ax
+ jmp L(e2)
+E(L(load_table2), X86_RET_UINT8)
+ movzx eax, al
+ jmp L(e2)
+E(L(load_table2), X86_RET_UINT16)
+ movzx eax, ax
+ jmp L(e2)
+E(L(load_table2), X86_RET_INT64)
+ mov edx, [esp+closure_CF+4]
+ jmp L(e2)
+E(L(load_table2), X86_RET_INT32)
+ nop
+ /* fallthru */
+E(L(load_table2), X86_RET_VOID)
+L(e2):
+ add esp, closure_FS
+L(UW16):
+ // cfi_adjust_cfa_offset(-closure_FS)
+ ret
+L(UW17):
+ // cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTPOP)
+ add esp, closure_FS
+L(UW18):
+ // cfi_adjust_cfa_offset(-closure_FS)
+ ret 4
+L(UW19):
+ // cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTARG)
+ jmp L(e2)
+E(L(load_table2), X86_RET_STRUCT_1B)
+ movzx eax, al
+ jmp L(e2)
+E(L(load_table2), X86_RET_STRUCT_2B)
+ movzx eax, ax
+ jmp L(e2)
+
+ /* Fill out the table so that bad values are predictable. */
+E(L(load_table2), X86_RET_UNUSED14)
+ int 3
+E(L(load_table2), X86_RET_UNUSED15)
+ int 3
+
+L(UW20):
+ // cfi_endproc
+ENDF(ffi_closure_i386)
+
+ALIGN 16
+PUBLIC ffi_go_closure_STDCALL
+ffi_go_closure_STDCALL PROC C
+L(UW21):
+ // cfi_startproc
+ sub esp, closure_FS
+L(UW22):
+ // cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ mov edx, [ecx+4] /* copy cif */
+ mov eax, [ecx+8] /* copy fun */
+ mov [esp+closure_CF+28], edx
+ mov [esp+closure_CF+32], eax
+ mov [esp+closure_CF+36], ecx /* closure is user_data */
+ jmp L(do_closure_STDCALL)
+L(UW23):
+ // cfi_endproc
+ENDF(ffi_go_closure_STDCALL)
+
+/* For REGISTER, we have no available parameter registers, and so we
+ enter here having pushed the closure onto the stack. */
+
+ALIGN 16
+PUBLIC ffi_closure_REGISTER
+ffi_closure_REGISTER PROC C
+L(UW24):
+ // cfi_startproc
+ // cfi_def_cfa(%esp, 8)
+ // cfi_offset(%eip, -8)
+ sub esp, closure_FS-4
+L(UW25):
+ // cfi_def_cfa_offset(closure_FS + 4)
+ FFI_CLOSURE_SAVE_REGS
+ mov ecx, [esp+closure_FS-4] /* load retaddr */
+ mov eax, [esp+closure_FS] /* load closure */
+ mov [esp+closure_FS], ecx /* move retaddr */
+ jmp L(do_closure_REGISTER)
+L(UW26):
+ // cfi_endproc
+ENDF(ffi_closure_REGISTER)
+
+/* For STDCALL (and others), we need to pop N bytes of arguments off
+ the stack following the closure. The amount needing to be popped
+ is returned to us from ffi_closure_inner. */
+
+ALIGN 16
+PUBLIC ffi_closure_STDCALL
+ffi_closure_STDCALL PROC C
+L(UW27):
+ // cfi_startproc
+ sub esp, closure_FS
+L(UW28):
+ // cfi_def_cfa_offset(closure_FS + 4)
+
+ FFI_CLOSURE_SAVE_REGS
+
+ /* Entry point from ffi_closure_REGISTER. */
+L(do_closure_REGISTER)::
+
+ FFI_CLOSURE_COPY_TRAMP_DATA
+
+	/* Entry point from preceding Go closure. */
+L(do_closure_STDCALL)::
+
+ FFI_CLOSURE_PREP_CALL
+ FFI_CLOSURE_CALL_INNER(29)
+
+ mov ecx, eax
+ shr ecx, X86_RET_POP_SHIFT /* isolate pop count */
+ lea ecx, [esp+closure_FS+ecx] /* compute popped esp */
+ mov edx, [esp+closure_FS] /* move return address */
+ mov [ecx], edx
+
+ /* From this point on, the value of %esp upon return is %ecx+4,
+ and we've copied the return address to %ecx to make return easy.
+ There's no point in representing this in the unwind info, as
+ there is always a window between the mov and the ret which
+ will be wrong from one point of view or another. */
+
+ FFI_CLOSURE_MASK_AND_JUMP L(C1(load_table,3))
+
+ ALIGN 8
+L(load_table3):
+E(L(load_table3), X86_RET_FLOAT)
+ fld DWORD PTR [esp+closure_CF]
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_DOUBLE)
+ fld QWORD PTR [esp+closure_CF]
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_LDOUBLE)
+ fld QWORD PTR [esp+closure_CF]
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_SINT8)
+ movsx eax, al
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_SINT16)
+ movsx eax, ax
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_UINT8)
+ movzx eax, al
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_UINT16)
+ movzx eax, ax
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_INT64)
+ mov edx, [esp+closure_CF+4]
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_INT32)
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_VOID)
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_STRUCTPOP)
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_STRUCTARG)
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_STRUCT_1B)
+ movzx eax, al
+ mov esp, ecx
+ ret
+E(L(load_table3), X86_RET_STRUCT_2B)
+ movzx eax, ax
+ mov esp, ecx
+ ret
+
+ /* Fill out the table so that bad values are predictable. */
+E(L(load_table3), X86_RET_UNUSED14)
+ int 3
+E(L(load_table3), X86_RET_UNUSED15)
+ int 3
+
+L(UW31):
+ // cfi_endproc
+ENDF(ffi_closure_STDCALL)
+
+#if !FFI_NO_RAW_API
+
+#define raw_closure_S_FS (16+16+12)
+
+ALIGN 16
+PUBLIC ffi_closure_raw_SYSV
+ffi_closure_raw_SYSV PROC C
+L(UW32):
+ // cfi_startproc
+ sub esp, raw_closure_S_FS
+L(UW33):
+ // cfi_def_cfa_offset(raw_closure_S_FS + 4)
+ mov [esp+raw_closure_S_FS-4], ebx
+L(UW34):
+ // cfi_rel_offset(%ebx, raw_closure_S_FS-4)
+
+ mov edx, [eax+FFI_TRAMPOLINE_SIZE+8] /* load cl->user_data */
+ mov [esp+12], edx
+ lea edx, [esp+raw_closure_S_FS+4] /* load raw_args */
+ mov [esp+8], edx
+ lea edx, [esp+16] /* load &res */
+ mov [esp+4], edx
+ mov ebx, [eax+FFI_TRAMPOLINE_SIZE] /* load cl->cif */
+ mov [esp], ebx
+ call DWORD PTR [eax+FFI_TRAMPOLINE_SIZE+4] /* call cl->fun */
+
+ mov eax, [ebx+20] /* load cif->flags */
+ and eax, X86_RET_TYPE_MASK
+// #ifdef __PIC__
+// call __x86.get_pc_thunk.bx
+// L(pc4):
+// lea ecx, L(load_table4)-L(pc4)(%ebx, %eax, 8), %ecx
+// #else
+	lea ecx, [L(load_table4)+eax*8]
+// #endif
+ mov ebx, [esp+raw_closure_S_FS-4]
+L(UW35):
+ // cfi_restore(%ebx)
+ mov eax, [esp+16] /* Optimistic load */
+ jmp dword ptr [ecx]
+
+ ALIGN 8
+L(load_table4):
+E(L(load_table4), X86_RET_FLOAT)
+ fld DWORD PTR [esp +16]
+ jmp L(e4)
+E(L(load_table4), X86_RET_DOUBLE)
+ fld QWORD PTR [esp +16]
+ jmp L(e4)
+E(L(load_table4), X86_RET_LDOUBLE)
+ fld QWORD PTR [esp +16]
+ jmp L(e4)
+E(L(load_table4), X86_RET_SINT8)
+ movsx eax, al
+ jmp L(e4)
+E(L(load_table4), X86_RET_SINT16)
+ movsx eax, ax
+ jmp L(e4)
+E(L(load_table4), X86_RET_UINT8)
+ movzx eax, al
+ jmp L(e4)
+E(L(load_table4), X86_RET_UINT16)
+ movzx eax, ax
+ jmp L(e4)
+E(L(load_table4), X86_RET_INT64)
+ mov edx, [esp+16+4]
+ jmp L(e4)
+E(L(load_table4), X86_RET_INT32)
+ nop
+ /* fallthru */
+E(L(load_table4), X86_RET_VOID)
+L(e4):
+ add esp, raw_closure_S_FS
+L(UW36):
+ // cfi_adjust_cfa_offset(-raw_closure_S_FS)
+ ret
+L(UW37):
+ // cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTPOP)
+ add esp, raw_closure_S_FS
+L(UW38):
+ // cfi_adjust_cfa_offset(-raw_closure_S_FS)
+ ret 4
+L(UW39):
+ // cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTARG)
+ jmp L(e4)
+E(L(load_table4), X86_RET_STRUCT_1B)
+ movzx eax, al
+ jmp L(e4)
+E(L(load_table4), X86_RET_STRUCT_2B)
+ movzx eax, ax
+ jmp L(e4)
+
+ /* Fill out the table so that bad values are predictable. */
+E(L(load_table4), X86_RET_UNUSED14)
+ int 3
+E(L(load_table4), X86_RET_UNUSED15)
+ int 3
+
+L(UW40):
+ // cfi_endproc
+ENDF(ffi_closure_raw_SYSV)
+
+#define raw_closure_T_FS (16+16+8)
+
+ALIGN 16
+PUBLIC ffi_closure_raw_THISCALL
+ffi_closure_raw_THISCALL PROC C
+L(UW41):
+ // cfi_startproc
+ /* Rearrange the stack such that %ecx is the first argument.
+ This means moving the return address. */
+ pop edx
+L(UW42):
+ // cfi_def_cfa_offset(0)
+ // cfi_register(%eip, %edx)
+ push ecx
+L(UW43):
+ // cfi_adjust_cfa_offset(4)
+ push edx
+L(UW44):
+ // cfi_adjust_cfa_offset(4)
+ // cfi_rel_offset(%eip, 0)
+ sub esp, raw_closure_T_FS
+L(UW45):
+ // cfi_adjust_cfa_offset(raw_closure_T_FS)
+ mov [esp+raw_closure_T_FS-4], ebx
+L(UW46):
+ // cfi_rel_offset(%ebx, raw_closure_T_FS-4)
+
+ mov edx, [eax+FFI_TRAMPOLINE_SIZE+8] /* load cl->user_data */
+ mov [esp+12], edx
+ lea edx, [esp+raw_closure_T_FS+4] /* load raw_args */
+ mov [esp+8], edx
+ lea edx, [esp+16] /* load &res */
+ mov [esp+4], edx
+ mov ebx, [eax+FFI_TRAMPOLINE_SIZE] /* load cl->cif */
+ mov [esp], ebx
+ call DWORD PTR [eax+FFI_TRAMPOLINE_SIZE+4] /* call cl->fun */
+
+ mov eax, [ebx+20] /* load cif->flags */
+ and eax, X86_RET_TYPE_MASK
+// #ifdef __PIC__
+// call __x86.get_pc_thunk.bx
+// L(pc5):
+// leal L(load_table5)-L(pc5)(%ebx, %eax, 8), %ecx
+// #else
+ lea ecx, [L(load_table5)+eax*8]
+//#endif
+ mov ebx, [esp+raw_closure_T_FS-4]
+L(UW47):
+ // cfi_restore(%ebx)
+ mov eax, [esp+16] /* Optimistic load */
+ jmp DWORD PTR [ecx]
+
+	ALIGN 8
+L(load_table5):
+E(L(load_table5), X86_RET_FLOAT)
+ fld DWORD PTR [esp +16]
+ jmp L(e5)
+E(L(load_table5), X86_RET_DOUBLE)
+ fld QWORD PTR [esp +16]
+ jmp L(e5)
+E(L(load_table5), X86_RET_LDOUBLE)
+ fld QWORD PTR [esp+16]
+ jmp L(e5)
+E(L(load_table5), X86_RET_SINT8)
+ movsx eax, al
+ jmp L(e5)
+E(L(load_table5), X86_RET_SINT16)
+ movsx eax, ax
+ jmp L(e5)
+E(L(load_table5), X86_RET_UINT8)
+ movzx eax, al
+ jmp L(e5)
+E(L(load_table5), X86_RET_UINT16)
+ movzx eax, ax
+ jmp L(e5)
+E(L(load_table5), X86_RET_INT64)
+ mov edx, [esp+16+4]
+ jmp L(e5)
+E(L(load_table5), X86_RET_INT32)
+ nop
+ /* fallthru */
+E(L(load_table5), X86_RET_VOID)
+L(e5):
+ add esp, raw_closure_T_FS
+L(UW48):
+ // cfi_adjust_cfa_offset(-raw_closure_T_FS)
+ /* Remove the extra %ecx argument we pushed. */
+ ret 4
+L(UW49):
+ // cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTPOP)
+ add esp, raw_closure_T_FS
+L(UW50):
+ // cfi_adjust_cfa_offset(-raw_closure_T_FS)
+ ret 8
+L(UW51):
+ // cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTARG)
+ jmp L(e5)
+E(L(load_table5), X86_RET_STRUCT_1B)
+ movzx eax, al
+ jmp L(e5)
+E(L(load_table5), X86_RET_STRUCT_2B)
+ movzx eax, ax
+ jmp L(e5)
+
+ /* Fill out the table so that bad values are predictable. */
+E(L(load_table5), X86_RET_UNUSED14)
+ int 3
+E(L(load_table5), X86_RET_UNUSED15)
+ int 3
+
+L(UW52):
+ // cfi_endproc
+ENDF(ffi_closure_raw_THISCALL)
+
+#endif /* !FFI_NO_RAW_API */
+
+#ifdef X86_DARWIN
+# define COMDAT(X) \
+ .section __TEXT,__text,coalesced,pure_instructions; \
+ .weak_definition X; \
+ FFI_HIDDEN(X)
+#elif defined __ELF__ && !(defined(__sun__) && defined(__svr4__))
+# define COMDAT(X) \
+ .section .text.X,"axG",@progbits,X,comdat; \
+ PUBLIC X; \
+ FFI_HIDDEN(X)
+#else
+# define COMDAT(X)
+#endif
+
+// #if defined(__PIC__)
+// COMDAT(C(__x86.get_pc_thunk.bx))
+// C(__x86.get_pc_thunk.bx):
+// movl (%esp), %ebx
+// ret
+// ENDF(C(__x86.get_pc_thunk.bx))
+// # if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
+// COMDAT(C(__x86.get_pc_thunk.dx))
+// C(__x86.get_pc_thunk.dx):
+// movl (%esp), %edx
+// ret
+// ENDF(C(__x86.get_pc_thunk.dx))
+// #endif /* DARWIN || HIDDEN */
+// #endif /* __PIC__ */
+
+#if 0
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
+
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(X86_WIN32)
+.section .eh_frame,"r"
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,EH_FRAME_FLAGS,@unwind
+#else
+.section .eh_frame,EH_FRAME_FLAGS,@progbits
+#endif
+
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X) X - .
+#else
+# define PCREL(X) X@rel
+#endif
+
+/* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */
+#define ADV(N, P) .byte 2, L(N)-L(P)
+
+ .balign 4
+L(CIE):
+ .set L(set0),L(ECIE)-L(SCIE)
+ .long L(set0) /* CIE Length */
+L(SCIE):
+ .long 0 /* CIE Identifier Tag */
+ .byte 1 /* CIE Version */
+ .ascii "zR\0" /* CIE Augmentation */
+ .byte 1 /* CIE Code Alignment Factor */
+ .byte 0x7c /* CIE Data Alignment Factor */
+ .byte 0x8 /* CIE RA Column */
+ .byte 1 /* Augmentation size */
+ .byte 0x1b /* FDE Encoding (pcrel sdata4) */
+ .byte 0xc, 4, 4 /* DW_CFA_def_cfa, %esp offset 4 */
+ .byte 0x80+8, 1 /* DW_CFA_offset, %eip offset 1*-4 */
+ .balign 4
+L(ECIE):
+
+ .set L(set1),L(EFDE1)-L(SFDE1)
+ .long L(set1) /* FDE Length */
+L(SFDE1):
+ .long L(SFDE1)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW0)) /* Initial location */
+ .long L(UW5)-L(UW0) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW1, UW0)
+ .byte 0xc, 5, 8 /* DW_CFA_def_cfa, %ebp 8 */
+ .byte 0x80+5, 2 /* DW_CFA_offset, %ebp 2*-4 */
+ ADV(UW2, UW1)
+ .byte 0x80+3, 0 /* DW_CFA_offset, %ebx 0*-4 */
+ ADV(UW3, UW2)
+ .byte 0xa /* DW_CFA_remember_state */
+ .byte 0xc, 4, 4 /* DW_CFA_def_cfa, %esp 4 */
+ .byte 0xc0+3 /* DW_CFA_restore, %ebx */
+ .byte 0xc0+5 /* DW_CFA_restore, %ebp */
+ ADV(UW4, UW3)
+ .byte 0xb /* DW_CFA_restore_state */
+ .balign 4
+L(EFDE1):
+
+ .set L(set2),L(EFDE2)-L(SFDE2)
+ .long L(set2) /* FDE Length */
+L(SFDE2):
+ .long L(SFDE2)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW6)) /* Initial location */
+ .long L(UW8)-L(UW6) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW7, UW6)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE2):
+
+ .set L(set3),L(EFDE3)-L(SFDE3)
+ .long L(set3) /* FDE Length */
+L(SFDE3):
+ .long L(SFDE3)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW9)) /* Initial location */
+ .long L(UW11)-L(UW9) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW10, UW9)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE3):
+
+ .set L(set4),L(EFDE4)-L(SFDE4)
+ .long L(set4) /* FDE Length */
+L(SFDE4):
+ .long L(SFDE4)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW12)) /* Initial location */
+ .long L(UW20)-L(UW12) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW13, UW12)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+ ADV(UW14, UW13)
+ .byte 0x80+3, (40-(closure_FS+4))/-4 /* DW_CFA_offset %ebx */
+ ADV(UW15, UW14)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW16, UW15)
+#else
+ ADV(UW16, UW13)
+#endif
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW17, UW16)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW18, UW17)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW19, UW18)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE4):
+
+ .set L(set5),L(EFDE5)-L(SFDE5)
+ .long L(set5) /* FDE Length */
+L(SFDE5):
+ .long L(SFDE5)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW21)) /* Initial location */
+ .long L(UW23)-L(UW21) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW22, UW21)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE5):
+
+ .set L(set6),L(EFDE6)-L(SFDE6)
+ .long L(set6) /* FDE Length */
+L(SFDE6):
+ .long L(SFDE6)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW24)) /* Initial location */
+ .long L(UW26)-L(UW24) /* Address range */
+ .byte 0 /* Augmentation size */
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ .byte 0x80+8, 2 /* DW_CFA_offset %eip, 2*-4 */
+ ADV(UW25, UW24)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE6):
+
+ .set L(set7),L(EFDE7)-L(SFDE7)
+ .long L(set7) /* FDE Length */
+L(SFDE7):
+ .long L(SFDE7)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW27)) /* Initial location */
+ .long L(UW31)-L(UW27) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW28, UW27)
+ .byte 0xe, closure_FS+4 /* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+ ADV(UW29, UW28)
+ .byte 0x80+3, (40-(closure_FS+4))/-4 /* DW_CFA_offset %ebx */
+ ADV(UW30, UW29)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+#endif
+ .balign 4
+L(EFDE7):
+
+#if !FFI_NO_RAW_API
+ .set L(set8),L(EFDE8)-L(SFDE8)
+ .long L(set8) /* FDE Length */
+L(SFDE8):
+ .long L(SFDE8)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW32)) /* Initial location */
+ .long L(UW40)-L(UW32) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW33, UW32)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW34, UW33)
+ .byte 0x80+3, 2 /* DW_CFA_offset %ebx 2*-4 */
+ ADV(UW35, UW34)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW36, UW35)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW37, UW36)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ ADV(UW38, UW37)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW39, UW38)
+ .byte 0xe, raw_closure_S_FS+4 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE8):
+
+ .set L(set9),L(EFDE9)-L(SFDE9)
+ .long L(set9) /* FDE Length */
+L(SFDE9):
+ .long L(SFDE9)-L(CIE) /* FDE CIE offset */
+ .long PCREL(L(UW41)) /* Initial location */
+ .long L(UW52)-L(UW41) /* Address range */
+ .byte 0 /* Augmentation size */
+ ADV(UW42, UW41)
+ .byte 0xe, 0 /* DW_CFA_def_cfa_offset */
+ .byte 0x9, 8, 2 /* DW_CFA_register %eip, %edx */
+ ADV(UW43, UW42)
+ .byte 0xe, 4 /* DW_CFA_def_cfa_offset */
+ ADV(UW44, UW43)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ .byte 0x80+8, 2 /* DW_CFA_offset %eip 2*-4 */
+ ADV(UW45, UW44)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ ADV(UW46, UW45)
+ .byte 0x80+3, 3 /* DW_CFA_offset %ebx 3*-4 */
+ ADV(UW47, UW46)
+ .byte 0xc0+3 /* DW_CFA_restore %ebx */
+ ADV(UW48, UW47)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ ADV(UW49, UW48)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ ADV(UW50, UW49)
+ .byte 0xe, 8 /* DW_CFA_def_cfa_offset */
+ ADV(UW51, UW50)
+ .byte 0xe, raw_closure_T_FS+8 /* DW_CFA_def_cfa_offset */
+ .balign 4
+L(EFDE9):
+#endif /* !FFI_NO_RAW_API */
+
+#ifdef _WIN32
+ .def @feat.00;
+ .scl 3;
+ .type 0;
+ .endef
+ PUBLIC @feat.00
+@feat.00 = 1
+#endif
+
+#endif /* ifdef _MSC_VER */
+#endif /* ifndef __x86_64__ */
+
+#if defined __ELF__ && defined __linux__
+ .section .note.GNU-stack,"",@progbits
+#endif
+#endif
+
+END \ No newline at end of file
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index c83010c..ca6fe0c 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -31,31 +31,10 @@
#include <fficonfig.h>
#include <ffi.h>
#include "internal64.h"
+#include "asmnames.h"
.text
-#define C2(X, Y) X ## Y
-#define C1(X, Y) C2(X, Y)
-#ifdef __USER_LABEL_PREFIX__
-# define C(X) C1(__USER_LABEL_PREFIX__, X)
-#else
-# define C(X) X
-#endif
-
-#ifdef __APPLE__
-# define L(X) C1(L, X)
-#else
-# define L(X) C1(.L, X)
-#endif
-
-#ifdef __ELF__
-# define PLT(X) X@PLT
-# define ENDF(X) .type X,@function; .size X, . - X
-#else
-# define PLT(X) X
-# define ENDF(X)
-#endif
-
/* This macro allows the safe creation of jump tables without an
actual table. The entry points into the table are all 8 bytes.
The use of ORG asserts that we're at the correct location. */
@@ -63,7 +42,11 @@
#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
# define E(BASE, X) .balign 8
#else
-# define E(BASE, X) .balign 8; .org BASE + X * 8
+# ifdef __CET__
+# define E(BASE, X) .balign 8; .org BASE + X * 16
+# else
+# define E(BASE, X) .balign 8; .org BASE + X * 8
+# endif
#endif
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
@@ -79,6 +62,7 @@
C(ffi_call_unix64):
L(UW0):
+ _CET_ENDBR
movq (%rsp), %r10 /* Load return address. */
leaq (%rdi, %rsi), %rax /* Find local stack base. */
movq %rdx, (%rax) /* Save flags. */
@@ -100,7 +84,6 @@ L(UW1):
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
- movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */
movq (%r10), %rdi
@@ -109,7 +92,7 @@ L(UW1):
movq 0x18(%r10), %rcx
movq 0x20(%r10), %r8
movq 0x28(%r10), %r9
- movl 0xb0(%r10), %eax
+ movl 0xb0(%r10), %eax /* Set number of SSE registers. */
testl %eax, %eax
jnz L(load_sse)
L(ret_from_load_sse):
@@ -137,6 +120,11 @@ L(UW2):
movzbl %cl, %r10d
leaq L(store_table)(%rip), %r11
ja L(sa)
+#ifdef __CET__
+ /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 +
+ 4 bytes NOP padding double slot size to 16 bytes. */
+ addl %r10d, %r10d
+#endif
leaq (%r11, %r10, 8), %r10
/* Prep for the structure cases: scratch area in redzone. */
@@ -146,57 +134,73 @@ L(UW2):
.balign 8
L(store_table):
E(L(store_table), UNIX64_RET_VOID)
+ _CET_ENDBR
ret
E(L(store_table), UNIX64_RET_UINT8)
+ _CET_ENDBR
movzbl %al, %eax
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_UINT16)
+ _CET_ENDBR
movzwl %ax, %eax
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_UINT32)
+ _CET_ENDBR
movl %eax, %eax
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_SINT8)
+ _CET_ENDBR
movsbq %al, %rax
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_SINT16)
+ _CET_ENDBR
movswq %ax, %rax
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_SINT32)
+ _CET_ENDBR
cltq
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_INT64)
+ _CET_ENDBR
movq %rax, (%rdi)
ret
E(L(store_table), UNIX64_RET_XMM32)
+ _CET_ENDBR
movd %xmm0, (%rdi)
ret
E(L(store_table), UNIX64_RET_XMM64)
+ _CET_ENDBR
movq %xmm0, (%rdi)
ret
E(L(store_table), UNIX64_RET_X87)
+ _CET_ENDBR
fstpt (%rdi)
ret
E(L(store_table), UNIX64_RET_X87_2)
+ _CET_ENDBR
fstpt (%rdi)
fstpt 16(%rdi)
ret
E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
+ _CET_ENDBR
movq %rax, 8(%rsi)
jmp L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
+ _CET_ENDBR
movq %xmm0, 8(%rsi)
jmp L(s2)
E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
+ _CET_ENDBR
movq %xmm1, 8(%rsi)
jmp L(s3)
E(L(store_table), UNIX64_RET_ST_RAX_RDX)
+ _CET_ENDBR
movq %rdx, 8(%rsi)
L(s2):
movq %rax, (%rsi)
@@ -248,6 +252,7 @@ ENDF(C(ffi_call_unix64))
C(ffi_closure_unix64_sse):
L(UW5):
+ _CET_ENDBR
subq $ffi_closure_FS, %rsp
L(UW6):
/* cfi_adjust_cfa_offset(ffi_closure_FS) */
@@ -271,6 +276,7 @@ ENDF(C(ffi_closure_unix64_sse))
C(ffi_closure_unix64):
L(UW8):
+ _CET_ENDBR
subq $ffi_closure_FS, %rsp
L(UW9):
/* cfi_adjust_cfa_offset(ffi_closure_FS) */
@@ -295,7 +301,7 @@ L(do_closure):
leaq ffi_closure_OFS_RVALUE(%rsp), %rcx /* Load rvalue */
movq %rsp, %r8 /* Load reg_args */
leaq ffi_closure_FS+8(%rsp), %r9 /* Load argp */
- call C(ffi_closure_unix64_inner)
+ call PLT(C(ffi_closure_unix64_inner))
/* Deallocate stack frame early; return value is now in redzone. */
addq $ffi_closure_FS, %rsp
@@ -307,6 +313,11 @@ L(UW10):
movzbl %al, %r10d
leaq L(load_table)(%rip), %r11
ja L(la)
+#ifdef __CET__
+ /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 +
+ 4 bytes NOP padding double slot size to 16 bytes. */
+ addl %r10d, %r10d
+#endif
leaq (%r11, %r10, 8), %r10
leaq ffi_closure_RED_RVALUE(%rsp), %rsi
jmp *%r10
@@ -314,51 +325,67 @@ L(UW10):
.balign 8
L(load_table):
E(L(load_table), UNIX64_RET_VOID)
+ _CET_ENDBR
ret
E(L(load_table), UNIX64_RET_UINT8)
+ _CET_ENDBR
movzbl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_UINT16)
+ _CET_ENDBR
movzwl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_UINT32)
+ _CET_ENDBR
movl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_SINT8)
+ _CET_ENDBR
movsbl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_SINT16)
+ _CET_ENDBR
movswl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_SINT32)
+ _CET_ENDBR
movl (%rsi), %eax
ret
E(L(load_table), UNIX64_RET_INT64)
+ _CET_ENDBR
movq (%rsi), %rax
ret
E(L(load_table), UNIX64_RET_XMM32)
+ _CET_ENDBR
movd (%rsi), %xmm0
ret
E(L(load_table), UNIX64_RET_XMM64)
+ _CET_ENDBR
movq (%rsi), %xmm0
ret
E(L(load_table), UNIX64_RET_X87)
+ _CET_ENDBR
fldt (%rsi)
ret
E(L(load_table), UNIX64_RET_X87_2)
+ _CET_ENDBR
fldt 16(%rsi)
fldt (%rsi)
ret
E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
+ _CET_ENDBR
movq 8(%rsi), %rax
jmp L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
+ _CET_ENDBR
movq 8(%rsi), %xmm0
jmp L(l2)
E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
+ _CET_ENDBR
movq 8(%rsi), %xmm1
jmp L(l3)
E(L(load_table), UNIX64_RET_ST_RAX_RDX)
+ _CET_ENDBR
movq 8(%rsi), %rdx
L(l2):
movq (%rsi), %rax
@@ -379,6 +406,7 @@ ENDF(C(ffi_closure_unix64))
C(ffi_go_closure_unix64_sse):
L(UW12):
+ _CET_ENDBR
subq $ffi_closure_FS, %rsp
L(UW13):
/* cfi_adjust_cfa_offset(ffi_closure_FS) */
@@ -402,6 +430,7 @@ ENDF(C(ffi_go_closure_unix64_sse))
C(ffi_go_closure_unix64):
L(UW15):
+ _CET_ENDBR
subq $ffi_closure_FS, %rsp
L(UW16):
/* cfi_adjust_cfa_offset(ffi_closure_FS) */
@@ -427,6 +456,81 @@ L(sse_entry2):
L(UW17):
ENDF(C(ffi_go_closure_unix64))
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ .balign 8
+ .globl C(ffi_closure_unix64_sse_alt)
+ FFI_HIDDEN(C(ffi_closure_unix64_sse_alt))
+
+C(ffi_closure_unix64_sse_alt):
+ /* See the comments above trampoline_code_table. */
+ _CET_ENDBR
+ movq 8(%rsp), %r10 /* Load closure in r10 */
+ addq $16, %rsp /* Restore the stack */
+ jmp C(ffi_closure_unix64_sse)
+ENDF(C(ffi_closure_unix64_sse_alt))
+
+ .balign 8
+ .globl C(ffi_closure_unix64_alt)
+ FFI_HIDDEN(C(ffi_closure_unix64_alt))
+
+C(ffi_closure_unix64_alt):
+ /* See the comments above trampoline_code_table. */
+ _CET_ENDBR
+ movq 8(%rsp), %r10 /* Load closure in r10 */
+ addq $16, %rsp /* Restore the stack */
+ jmp C(ffi_closure_unix64)
+ ENDF(C(ffi_closure_unix64_alt))
+
+/*
+ * Below is the definition of the trampoline code table. Each element in
+ * the code table is a trampoline.
+ *
+ * Because we jump to the trampoline, we place a _CET_ENDBR at the
+ * beginning of the trampoline to mark it as a valid branch target. This is
+ * part of the the Intel CET (Control Flow Enforcement Technology).
+ */
+/*
+ * The trampoline uses register r10. It saves the original value of r10 on
+ * the stack.
+ *
+ * The trampoline has two parameters - target code to jump to and data for
+ * the target code. The trampoline extracts the parameters from its parameter
+ * block (see tramp_table_map()). The trampoline saves the data address on
+ * the stack. Finally, it jumps to the target code.
+ *
+ * The target code can choose to:
+ *
+ * - restore the value of r10
+ * - load the data address in a register
+ * - restore the stack pointer to what it was when the trampoline was invoked.
+ */
+#ifdef ENDBR_PRESENT
+#define X86_DATA_OFFSET 4077
+#define X86_CODE_OFFSET 4073
+#else
+#define X86_DATA_OFFSET 4081
+#define X86_CODE_OFFSET 4077
+#endif
+
+ .align UNIX64_TRAMP_MAP_SIZE
+ .globl trampoline_code_table
+ FFI_HIDDEN(C(trampoline_code_table))
+
+C(trampoline_code_table):
+ .rept UNIX64_TRAMP_MAP_SIZE / UNIX64_TRAMP_SIZE
+ _CET_ENDBR
+ subq $16, %rsp /* Make space on the stack */
+ movq %r10, (%rsp) /* Save %r10 on stack */
+ movq X86_DATA_OFFSET(%rip), %r10 /* Copy data into %r10 */
+ movq %r10, 8(%rsp) /* Save data on stack */
+ movq X86_CODE_OFFSET(%rip), %r10 /* Copy code into %r10 */
+ jmp *%r10 /* Jump to code */
+ .align 8
+ .endr
+ENDF(C(trampoline_code_table))
+ .align UNIX64_TRAMP_MAP_SIZE
+#endif /* FFI_EXEC_STATIC_TRAMP */
+
/* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
#ifdef __APPLE__
@@ -445,7 +549,12 @@ EHFrame0:
#endif
/* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */
-#define ADV(N, P) .byte 2, L(N)-L(P)
+#ifdef __CET__
+/* Use DW_CFA_advance_loc2 when IBT is enabled. */
+# define ADV(N, P) .byte 3; .2byte L(N)-L(P)
+#else
+# define ADV(N, P) .byte 2, L(N)-L(P)
+#endif
.balign 8
L(CIE):
@@ -538,6 +647,47 @@ L(SFDE5):
L(EFDE5):
#ifdef __APPLE__
.subsections_via_symbols
+ .section __LD,__compact_unwind,regular,debug
+
+ /* compact unwind for ffi_call_unix64 */
+ .quad C(ffi_call_unix64)
+ .set L1,L(UW4)-L(UW0)
+ .long L1
+ .long 0x04000000 /* use dwarf unwind info */
+ .quad 0
+ .quad 0
+
+ /* compact unwind for ffi_closure_unix64_sse */
+ .quad C(ffi_closure_unix64_sse)
+ .set L2,L(UW7)-L(UW5)
+ .long L2
+ .long 0x04000000 /* use dwarf unwind info */
+ .quad 0
+ .quad 0
+
+ /* compact unwind for ffi_closure_unix64 */
+ .quad C(ffi_closure_unix64)
+ .set L3,L(UW11)-L(UW8)
+ .long L3
+ .long 0x04000000 /* use dwarf unwind info */
+ .quad 0
+ .quad 0
+
+ /* compact unwind for ffi_go_closure_unix64_sse */
+ .quad C(ffi_go_closure_unix64_sse)
+ .set L4,L(UW14)-L(UW12)
+ .long L4
+ .long 0x04000000 /* use dwarf unwind info */
+ .quad 0
+ .quad 0
+
+ /* compact unwind for ffi_go_closure_unix64 */
+ .quad C(ffi_go_closure_unix64)
+ .set L5,L(UW17)-L(UW15)
+ .long L5
+ .long 0x04000000 /* use dwarf unwind info */
+ .quad 0
+ .quad 0
#endif
#endif /* __x86_64__ */
diff --git a/libffi/src/x86/win64.S b/libffi/src/x86/win64.S
index a5a20b6..f3ace8d 100644
--- a/libffi/src/x86/win64.S
+++ b/libffi/src/x86/win64.S
@@ -1,27 +1,37 @@
+#ifdef __x86_64__
#define LIBFFI_ASM
#include <fficonfig.h>
#include <ffi.h>
#include <ffi_cfi.h>
+#include "asmnames.h"
#if defined(HAVE_AS_CFI_PSEUDO_OP)
.cfi_sections .debug_frame
#endif
+#ifdef X86_WIN64
+#define SEH(...) __VA_ARGS__
#define arg0 %rcx
#define arg1 %rdx
#define arg2 %r8
#define arg3 %r9
-
-#ifdef SYMBOL_UNDERSCORE
-#define SYMBOL_NAME(name) _##name
#else
-#define SYMBOL_NAME(name) name
+#define SEH(...)
+#define arg0 %rdi
+#define arg1 %rsi
+#define arg2 %rdx
+#define arg3 %rcx
#endif
-.macro E which
- .align 8
- .org 0b + \which * 8
-.endm
+/* This macro allows the safe creation of jump tables without an
+ actual table. The entry points into the table are all 8 bytes.
+ The use of ORG asserts that we're at the correct location. */
+/* ??? The clang assembler doesn't handle .org with symbolic expressions. */
+#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
+# define E(BASE, X) .balign 8
+#else
+# define E(BASE, X) .balign 8; .org BASE + (X) * 8
+#endif
.text
@@ -32,11 +42,13 @@
deallocate some of the stack that has been alloca'd. */
.align 8
- .globl ffi_call_win64
+ .globl C(ffi_call_win64)
+ FFI_HIDDEN(C(ffi_call_win64))
- .seh_proc ffi_call_win64
-ffi_call_win64:
+ SEH(.seh_proc ffi_call_win64)
+C(ffi_call_win64):
cfi_startproc
+ _CET_ENDBR
/* Set up the local stack frame and install it in rbp/rsp. */
movq (%rsp), %rax
movq %rbp, (arg1)
@@ -44,9 +56,9 @@ ffi_call_win64:
movq arg1, %rbp
cfi_def_cfa(%rbp, 16)
cfi_rel_offset(%rbp, 0)
- .seh_pushreg %rbp
- .seh_setframe %rbp, 0
- .seh_endprologue
+ SEH(.seh_pushreg %rbp)
+ SEH(.seh_setframe %rbp, 0)
+ SEH(.seh_endprologue)
movq arg0, %rsp
movq arg2, %r10
@@ -69,7 +81,7 @@ ffi_call_win64:
cmpl $FFI_TYPE_SMALL_STRUCT_4B, %ecx
leaq (%r10, %rcx, 8), %r10
ja 99f
- jmp *%r10
+ _CET_NOTRACK jmp *%r10
/* Below, we're space constrained most of the time. Thus we eschew the
modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes). */
@@ -84,72 +96,73 @@ ffi_call_win64:
.align 8
0:
-E FFI_TYPE_VOID
+E(0b, FFI_TYPE_VOID)
epilogue
-E FFI_TYPE_INT
+E(0b, FFI_TYPE_INT)
movslq %eax, %rax
movq %rax, (%r8)
epilogue
-E FFI_TYPE_FLOAT
+E(0b, FFI_TYPE_FLOAT)
movss %xmm0, (%r8)
epilogue
-E FFI_TYPE_DOUBLE
+E(0b, FFI_TYPE_DOUBLE)
movsd %xmm0, (%r8)
epilogue
-E FFI_TYPE_LONGDOUBLE
- call abort
-E FFI_TYPE_UINT8
+// FFI_TYPE_LONGDOUBLE may be FFI_TYPE_DOUBLE but we need a different value here.
+E(0b, FFI_TYPE_DOUBLE + 1)
+ call PLT(C(abort))
+E(0b, FFI_TYPE_UINT8)
movzbl %al, %eax
movq %rax, (%r8)
epilogue
-E FFI_TYPE_SINT8
+E(0b, FFI_TYPE_SINT8)
movsbq %al, %rax
jmp 98f
-E FFI_TYPE_UINT16
+E(0b, FFI_TYPE_UINT16)
movzwl %ax, %eax
movq %rax, (%r8)
epilogue
-E FFI_TYPE_SINT16
+E(0b, FFI_TYPE_SINT16)
movswq %ax, %rax
jmp 98f
-E FFI_TYPE_UINT32
+E(0b, FFI_TYPE_UINT32)
movl %eax, %eax
movq %rax, (%r8)
epilogue
-E FFI_TYPE_SINT32
+E(0b, FFI_TYPE_SINT32)
movslq %eax, %rax
movq %rax, (%r8)
epilogue
-E FFI_TYPE_UINT64
+E(0b, FFI_TYPE_UINT64)
98: movq %rax, (%r8)
epilogue
-E FFI_TYPE_SINT64
+E(0b, FFI_TYPE_SINT64)
movq %rax, (%r8)
epilogue
-E FFI_TYPE_STRUCT
+E(0b, FFI_TYPE_STRUCT)
epilogue
-E FFI_TYPE_POINTER
+E(0b, FFI_TYPE_POINTER)
movq %rax, (%r8)
epilogue
-E FFI_TYPE_COMPLEX
- call abort
-E FFI_TYPE_SMALL_STRUCT_1B
+E(0b, FFI_TYPE_COMPLEX)
+ call PLT(C(abort))
+E(0b, FFI_TYPE_SMALL_STRUCT_1B)
movb %al, (%r8)
epilogue
-E FFI_TYPE_SMALL_STRUCT_2B
+E(0b, FFI_TYPE_SMALL_STRUCT_2B)
movw %ax, (%r8)
epilogue
-E FFI_TYPE_SMALL_STRUCT_4B
+E(0b, FFI_TYPE_SMALL_STRUCT_4B)
movl %eax, (%r8)
epilogue
.align 8
-99: call abort
+99: call PLT(C(abort))
-.purgem epilogue
+ epilogue
cfi_endproc
- .seh_endproc
+ SEH(.seh_endproc)
/* 32 bytes of outgoing register stack space, 8 bytes of alignment,
@@ -159,44 +172,48 @@ E FFI_TYPE_SMALL_STRUCT_4B
#define ffi_clo_OFF_X (32+8+16)
.align 8
- .globl ffi_go_closure_win64
+ .globl C(ffi_go_closure_win64)
+ FFI_HIDDEN(C(ffi_go_closure_win64))
- .seh_proc ffi_go_closure_win64
-ffi_go_closure_win64:
+ SEH(.seh_proc ffi_go_closure_win64)
+C(ffi_go_closure_win64):
cfi_startproc
+ _CET_ENDBR
/* Save all integer arguments into the incoming reg stack space. */
- movq arg0, 8(%rsp)
- movq arg1, 16(%rsp)
- movq arg2, 24(%rsp)
- movq arg3, 32(%rsp)
-
- movq 8(%r10), arg0 /* load cif */
- movq 16(%r10), arg1 /* load fun */
- movq %r10, arg2 /* closure is user_data */
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
+
+ movq 8(%r10), %rcx /* load cif */
+ movq 16(%r10), %rdx /* load fun */
+ movq %r10, %r8 /* closure is user_data */
jmp 0f
cfi_endproc
- .seh_endproc
+ SEH(.seh_endproc)
.align 8
- .globl ffi_closure_win64
+ .globl C(ffi_closure_win64)
+ FFI_HIDDEN(C(ffi_closure_win64))
- .seh_proc ffi_closure_win64
-ffi_closure_win64:
+ SEH(.seh_proc ffi_closure_win64)
+C(ffi_closure_win64):
cfi_startproc
+ _CET_ENDBR
/* Save all integer arguments into the incoming reg stack space. */
- movq arg0, 8(%rsp)
- movq arg1, 16(%rsp)
- movq arg2, 24(%rsp)
- movq arg3, 32(%rsp)
-
- movq FFI_TRAMPOLINE_SIZE(%r10), arg0 /* load cif */
- movq FFI_TRAMPOLINE_SIZE+8(%r10), arg1 /* load fun */
- movq FFI_TRAMPOLINE_SIZE+16(%r10), arg2 /* load user_data */
+ movq %rcx, 8(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %r8, 24(%rsp)
+ movq %r9, 32(%rsp)
+
+ movq FFI_TRAMPOLINE_SIZE(%r10), %rcx /* load cif */
+ movq FFI_TRAMPOLINE_SIZE+8(%r10), %rdx /* load fun */
+ movq FFI_TRAMPOLINE_SIZE+16(%r10), %r8 /* load user_data */
0:
subq $ffi_clo_FS, %rsp
cfi_adjust_cfa_offset(ffi_clo_FS)
- .seh_stackalloc ffi_clo_FS
- .seh_endprologue
+ SEH(.seh_stackalloc ffi_clo_FS)
+ SEH(.seh_endprologue)
/* Save all sse arguments into the stack frame. */
movsd %xmm0, ffi_clo_OFF_X(%rsp)
@@ -204,8 +221,8 @@ ffi_closure_win64:
movsd %xmm2, ffi_clo_OFF_X+16(%rsp)
movsd %xmm3, ffi_clo_OFF_X+24(%rsp)
- leaq ffi_clo_OFF_R(%rsp), arg3
- call ffi_closure_win64_inner
+ leaq ffi_clo_OFF_R(%rsp), %r9
+ call PLT(C(ffi_closure_win64_inner))
/* Load the result into both possible result registers. */
movq ffi_clo_OFF_R(%rsp), %rax
@@ -216,4 +233,23 @@ ffi_closure_win64:
ret
cfi_endproc
- .seh_endproc
+ SEH(.seh_endproc)
+
+#if defined(FFI_EXEC_STATIC_TRAMP)
+ .align 8
+ .globl C(ffi_closure_win64_alt)
+ FFI_HIDDEN(C(ffi_closure_win64_alt))
+
+ SEH(.seh_proc ffi_closure_win64_alt)
+C(ffi_closure_win64_alt):
+ _CET_ENDBR
+ movq 8(%rsp), %r10
+ addq $16, %rsp
+ jmp C(ffi_closure_win64)
+ SEH(.seh_endproc)
+#endif
+#endif /* __x86_64__ */
+
+#if defined __ELF__ && defined __linux__
+ .section .note.GNU-stack,"",@progbits
+#endif
diff --git a/libffi/src/x86/win64_intel.S b/libffi/src/x86/win64_intel.S
new file mode 100644
index 0000000..970a4f9
--- /dev/null
+++ b/libffi/src/x86/win64_intel.S
@@ -0,0 +1,238 @@
+#define LIBFFI_ASM
+#include <fficonfig.h>
+#include <ffi.h>
+#include <ffi_cfi.h>
+#include "asmnames.h"
+
+#if defined(HAVE_AS_CFI_PSEUDO_OP)
+ .cfi_sections .debug_frame
+#endif
+
+#ifdef X86_WIN64
+#define SEH(...) __VA_ARGS__
+#define arg0 rcx
+#define arg1 rdx
+#define arg2 r8
+#define arg3 r9
+#else
+#define SEH(...)
+#define arg0 rdi
+#define arg1 rsi
+#define arg2 rdx
+#define arg3 rcx
+#endif
+
+/* This macro allows the safe creation of jump tables without an
+ actual table. The entry points into the table are all 8 bytes.
+ The use of ORG asserts that we're at the correct location. */
+/* ??? The clang assembler doesn't handle .org with symbolic expressions. */
+#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
+# define E(BASE, X) ALIGN 8
+#else
+# define E(BASE, X) ALIGN 8; ORG BASE + (X) * 8
+#endif
+
+ .CODE
+ extern PLT(C(abort)):near
+ extern C(ffi_closure_win64_inner):near
+
+/* ffi_call_win64 (void *stack, struct win64_call_frame *frame, void *r10)
+
+ Bit o trickiness here -- FRAME is the base of the stack frame
+ for this function. This has been allocated by ffi_call. We also
+ deallocate some of the stack that has been alloca'd. */
+
+ ALIGN 8
+ PUBLIC C(ffi_call_win64)
+
+ ; SEH(.safesh ffi_call_win64)
+C(ffi_call_win64) proc SEH(frame)
+ cfi_startproc
+ /* Set up the local stack frame and install it in rbp/rsp. */
+ mov RAX, [RSP] ; movq (%rsp), %rax
+ mov [arg1], RBP ; movq %rbp, (arg1)
+ mov [arg1 + 8], RAX; movq %rax, 8(arg1)
+ mov RBP, arg1; movq arg1, %rbp
+ cfi_def_cfa(rbp, 16)
+ cfi_rel_offset(rbp, 0)
+ SEH(.pushreg rbp)
+ SEH(.setframe rbp, 0)
+ SEH(.endprolog)
+ mov RSP, arg0 ; movq arg0, %rsp
+
+ mov R10, arg2 ; movq arg2, %r10
+
+ /* Load all slots into both general and xmm registers. */
+ mov RCX, [RSP] ; movq (%rsp), %rcx
+ movsd XMM0, qword ptr [RSP] ; movsd (%rsp), %xmm0
+ mov RDX, [RSP + 8] ;movq 8(%rsp), %rdx
+ movsd XMM1, qword ptr [RSP + 8]; movsd 8(%rsp), %xmm1
+ mov R8, [RSP + 16] ; movq 16(%rsp), %r8
+ movsd XMM2, qword ptr [RSP + 16] ; movsd 16(%rsp), %xmm2
+ mov R9, [RSP + 24] ; movq 24(%rsp), %r9
+ movsd XMM3, qword ptr [RSP + 24] ;movsd 24(%rsp), %xmm3
+
+ CALL qword ptr [RBP + 16] ; call *16(%rbp)
+
+ mov ECX, [RBP + 24] ; movl 24(%rbp), %ecx
+ mov R8, [RBP + 32] ; movq 32(%rbp), %r8
+ LEA R10, ffi_call_win64_tab ; leaq 0f(%rip), %r10
+ CMP ECX, FFI_TYPE_SMALL_STRUCT_4B ; cmpl $FFI_TYPE_SMALL_STRUCT_4B, %ecx
+ LEA R10, [R10 + RCX*8] ; leaq (%r10, %rcx, 8), %r10
+ JA L99 ; ja 99f
+ JMP R10 ; jmp *%r10
+
+/* Below, we're space constrained most of the time. Thus we eschew the
+ modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes). */
+epilogue macro
+ LEAVE
+ cfi_remember_state
+ cfi_def_cfa(rsp, 8)
+ cfi_restore(rbp)
+ RET
+ cfi_restore_state
+endm
+
+ ALIGN 8
+ffi_call_win64_tab LABEL NEAR
+E(0b, FFI_TYPE_VOID)
+ epilogue
+E(0b, FFI_TYPE_INT)
+ movsxd rax, eax ; movslq %eax, %rax
+ mov qword ptr [r8], rax; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_FLOAT)
+ movss dword ptr [r8], xmm0 ; movss %xmm0, (%r8)
+ epilogue
+E(0b, FFI_TYPE_DOUBLE)
+ movsd qword ptr[r8], xmm0; movsd %xmm0, (%r8)
+ epilogue
+// FFI_TYPE_LONGDOUBLE may be FFI_TYPE_DOUBLE but we need a different value here.
+E(0b, FFI_TYPE_DOUBLE + 1)
+ call PLT(C(abort))
+E(0b, FFI_TYPE_UINT8)
+ movzx eax, al ;movzbl %al, %eax
+ mov qword ptr[r8], rax; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SINT8)
+ movsx rax, al ; movsbq %al, %rax
+ jmp L98
+E(0b, FFI_TYPE_UINT16)
+ movzx eax, ax ; movzwl %ax, %eax
+ mov qword ptr[r8], rax; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SINT16)
+ movsx rax, ax; movswq %ax, %rax
+ jmp L98
+E(0b, FFI_TYPE_UINT32)
+ mov eax, eax; movl %eax, %eax
+ mov qword ptr[r8], rax ; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SINT32)
+ movsxd rax, eax; movslq %eax, %rax
+ mov qword ptr [r8], rax; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_UINT64)
+L98 LABEL near
+ mov qword ptr [r8], rax ; movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SINT64)
+ mov qword ptr [r8], rax;movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_STRUCT)
+ epilogue
+E(0b, FFI_TYPE_POINTER)
+ mov qword ptr [r8], rax ;movq %rax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_COMPLEX)
+ call PLT(C(abort))
+E(0b, FFI_TYPE_SMALL_STRUCT_1B)
+ mov byte ptr [r8], al ; movb %al, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SMALL_STRUCT_2B)
+ mov word ptr [r8], ax ; movw %ax, (%r8)
+ epilogue
+E(0b, FFI_TYPE_SMALL_STRUCT_4B)
+ mov dword ptr [r8], eax ; movl %eax, (%r8)
+ epilogue
+
+ align 8
+L99 LABEL near
+ call PLT(C(abort))
+
+ epilogue
+
+ cfi_endproc
+ C(ffi_call_win64) endp
+
+
+/* 32 bytes of outgoing register stack space, 8 bytes of alignment,
+ 16 bytes of result, 32 bytes of xmm registers. */
+#define ffi_clo_FS (32+8+16+32)
+#define ffi_clo_OFF_R (32+8)
+#define ffi_clo_OFF_X (32+8+16)
+
+ align 8
+ PUBLIC C(ffi_go_closure_win64)
+
+C(ffi_go_closure_win64) proc
+ cfi_startproc
+ /* Save all integer arguments into the incoming reg stack space. */
+ mov qword ptr [rsp + 8], rcx; movq %rcx, 8(%rsp)
+ mov qword ptr [rsp + 16], rdx; movq %rdx, 16(%rsp)
+ mov qword ptr [rsp + 24], r8; movq %r8, 24(%rsp)
+ mov qword ptr [rsp + 32], r9 ;movq %r9, 32(%rsp)
+
+ mov rcx, qword ptr [r10 + 8]; movq 8(%r10), %rcx /* load cif */
+ mov rdx, qword ptr [r10 + 16]; movq 16(%r10), %rdx /* load fun */
+ mov r8, r10 ; movq %r10, %r8 /* closure is user_data */
+ jmp ffi_closure_win64_2
+ cfi_endproc
+ C(ffi_go_closure_win64) endp
+
+ align 8
+
+PUBLIC C(ffi_closure_win64)
+C(ffi_closure_win64) PROC FRAME
+ cfi_startproc
+ /* Save all integer arguments into the incoming reg stack space. */
+ mov qword ptr [rsp + 8], rcx; movq %rcx, 8(%rsp)
+ mov qword ptr [rsp + 16], rdx; movq %rdx, 16(%rsp)
+ mov qword ptr [rsp + 24], r8; movq %r8, 24(%rsp)
+ mov qword ptr [rsp + 32], r9; movq %r9, 32(%rsp)
+
+ mov rcx, qword ptr [FFI_TRAMPOLINE_SIZE + r10] ;movq FFI_TRAMPOLINE_SIZE(%r10), %rcx /* load cif */
+ mov rdx, qword ptr [FFI_TRAMPOLINE_SIZE + 8 + r10] ; movq FFI_TRAMPOLINE_SIZE+8(%r10), %rdx /* load fun */
+ mov r8, qword ptr [FFI_TRAMPOLINE_SIZE+16+r10] ;movq FFI_TRAMPOLINE_SIZE+16(%r10), %r8 /* load user_data */
+ffi_closure_win64_2 LABEL near
+ sub rsp, ffi_clo_FS ;subq $ffi_clo_FS, %rsp
+ cfi_adjust_cfa_offset(ffi_clo_FS)
+ SEH(.allocstack ffi_clo_FS)
+ SEH(.endprolog)
+
+ /* Save all sse arguments into the stack frame. */
+ movsd qword ptr [ffi_clo_OFF_X + rsp], xmm0 ; movsd %xmm0, ffi_clo_OFF_X(%rsp)
+ movsd qword ptr [ffi_clo_OFF_X+8+rsp], xmm1 ; movsd %xmm1, ffi_clo_OFF_X+8(%rsp)
+ movsd qword ptr [ffi_clo_OFF_X+16+rsp], xmm2 ; movsd %xmm2, ffi_clo_OFF_X+16(%rsp)
+ movsd qword ptr [ffi_clo_OFF_X+24+rsp], xmm3 ; movsd %xmm3, ffi_clo_OFF_X+24(%rsp)
+
+ lea r9, [ffi_clo_OFF_R + rsp] ; leaq ffi_clo_OFF_R(%rsp), %r9
+ call C(ffi_closure_win64_inner)
+
+ /* Load the result into both possible result registers. */
+
+ mov rax, qword ptr [ffi_clo_OFF_R + rsp] ;movq ffi_clo_OFF_R(%rsp), %rax
+ movsd xmm0, qword ptr [rsp + ffi_clo_OFF_R] ;movsd ffi_clo_OFF_R(%rsp), %xmm0
+
+ add rsp, ffi_clo_FS ;addq $ffi_clo_FS, %rsp
+ cfi_adjust_cfa_offset(-ffi_clo_FS)
+ ret
+
+ cfi_endproc
+ C(ffi_closure_win64) endp
+
+#if defined __ELF__ && defined __linux__
+ .section .note.GNU-stack,"",@progbits
+#endif
+_text ends
+end \ No newline at end of file
diff --git a/libffi/src/xtensa/ffi.c b/libffi/src/xtensa/ffi.c
index fd94daf..9a0575f 100644
--- a/libffi/src/xtensa/ffi.c
+++ b/libffi/src/xtensa/ffi.c
@@ -89,7 +89,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
/* Round the stack up to a full 4 register frame, just in case
(we use this size in movsp). This way, it's also a multiple of
8 bytes for 64-bit arguments. */
- cif->bytes = ALIGN(cif->bytes, 16);
+ cif->bytes = FFI_ALIGN(cif->bytes, 16);
return FFI_OK;
}
@@ -205,7 +205,7 @@ void ffi_call(ffi_cif* cif, void(*fn)(void), void *rvalue, void **avalue)
if (flags == FFI_TYPE_STRUCT && (rsize <= 16 || rvalue == NULL))
{
- alloc = alloca(ALIGN(rsize, 4));
+ alloc = alloca(FFI_ALIGN(rsize, 4));
ecif.rvalue = alloc;
}
else
diff --git a/libffi/src/xtensa/sysv.S b/libffi/src/xtensa/sysv.S
index 64e6a09..e942179 100644
--- a/libffi/src/xtensa/sysv.S
+++ b/libffi/src/xtensa/sysv.S
@@ -169,8 +169,13 @@ ENTRY(ffi_cacheflush)
entry a1, 16
-1: dhwbi a2, 0
+1:
+#if XCHAL_DCACHE_SIZE
+ dhwbi a2, 0
+#endif
+#if XCHAL_ICACHE_SIZE
ihi a2, 0
+#endif
addi a2, a2, 4
blt a2, a3, 1b