Merge libffi to upstream commit c82cc159426d8d4402375fa1ae3f045b9cf82e16

From-SVN: r219477
author: Richard Henderson <rth@redhat.com> 2015-01-12 08:19:59 -0800
committer: Richard Henderson <rth@gcc.gnu.org> 2015-01-12 08:19:59 -0800
commit: b1760f7f915a36ee9b4636fb54719c9b3ae59356 (patch)
tree: 1a64d747b069bdebf651d856989dd40a54daf0cc /libffi/src/x86
parent: 62e22fcb7985349b93646b86351033e1fb09c46c (diff)
download: gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.zip
gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.tar.gz
gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.tar.bz2
13 files changed, 4314 insertions, 3704 deletions
diff --git a/libffi/src/x86/darwin64_c.c b/libffi/src/x86/darwin64_c.c
new file mode 100644
index 0000000..1daa1c0
--- /dev/null
+++ b/libffi/src/x86/darwin64_c.c
@@ -0,0 +1,643 @@
+/* -----------------------------------------------------------------------
+   ffi64.c - Copyright (c) 20011  Anthony Green
+             Copyright (c) 2008, 2010  Red Hat, Inc.
+             Copyright (c) 2002, 2007  Bo Thorsen <bo@suse.de>
+             
+   x86-64 Foreign Function Interface 
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+
+#include <stdlib.h>
+#include <stdarg.h>
+
+#ifdef __x86_64__
+
+#define MAX_GPR_REGS 6
+#define MAX_SSE_REGS 8
+
+#ifdef __INTEL_COMPILER
+#define UINT128 __m128
+#else
+#define UINT128 __int128_t
+#endif
+
+struct register_args
+{
+  /* Registers for argument passing.  */
+  UINT64 gpr[MAX_GPR_REGS];
+  UINT128 sse[MAX_SSE_REGS];
+};
+
+extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
+			     void *raddr, void (*fnaddr)(void), unsigned ssecount);
+
+/* All reference to register classes here is identical to the code in
+   gcc/config/i386/i386.c. Do *not* change one without the other.  */
+
+/* Register class used for passing given 64bit part of the argument.
+   These represent classes as documented by the PS ABI, with the
+   exception of SSESF, SSEDF classes, that are basically SSE class,
+   just gcc will use SF or DFmode move instead of DImode to avoid
+   reformatting penalties.
+
+   Similary we play games with INTEGERSI_CLASS to use cheaper SImode moves
+   whenever possible (upper half does contain padding).  */
+enum x86_64_reg_class
+  {
+    X86_64_NO_CLASS,
+    X86_64_INTEGER_CLASS,
+    X86_64_INTEGERSI_CLASS,
+    X86_64_SSE_CLASS,
+    X86_64_SSESF_CLASS,
+    X86_64_SSEDF_CLASS,
+    X86_64_SSEUP_CLASS,
+    X86_64_X87_CLASS,
+    X86_64_X87UP_CLASS,
+    X86_64_COMPLEX_X87_CLASS,
+    X86_64_MEMORY_CLASS
+  };
+
+#define MAX_CLASSES 4
+
+#define SSE_CLASS_P(X)	((X) >= X86_64_SSE_CLASS && X <= X86_64_SSEUP_CLASS)
+
+/* x86-64 register passing implementation.  See x86-64 ABI for details.  Goal
+   of this code is to classify each 8bytes of incoming argument by the register
+   class and assign registers accordingly.  */
+
+/* Return the union class of CLASS1 and CLASS2.
+   See the x86-64 PS ABI for details.  */
+
+static enum x86_64_reg_class
+merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
+{
+  /* Rule #1: If both classes are equal, this is the resulting class.  */
+  if (class1 == class2)
+    return class1;
+
+  /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
+     the other class.  */
+  if (class1 == X86_64_NO_CLASS)
+    return class2;
+  if (class2 == X86_64_NO_CLASS)
+    return class1;
+
+  /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
+  if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
+    return X86_64_MEMORY_CLASS;
+
+  /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
+  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
+      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
+    return X86_64_INTEGERSI_CLASS;
+  if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
+      || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
+    return X86_64_INTEGER_CLASS;
+
+  /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
+     MEMORY is used.  */
+  if (class1 == X86_64_X87_CLASS
+      || class1 == X86_64_X87UP_CLASS
+      || class1 == X86_64_COMPLEX_X87_CLASS
+      || class2 == X86_64_X87_CLASS
+      || class2 == X86_64_X87UP_CLASS
+      || class2 == X86_64_COMPLEX_X87_CLASS)
+    return X86_64_MEMORY_CLASS;
+
+  /* Rule #6: Otherwise class SSE is used.  */
+  return X86_64_SSE_CLASS;
+}
+
+/* Classify the argument of type TYPE and mode MODE.
+   CLASSES will be filled by the register class used to pass each word
+   of the operand.  The number of words is returned.  In case the parameter
+   should be passed in memory, 0 is returned. As a special case for zero
+   sized containers, classes[0] will be NO_CLASS and 1 is returned.
+
+   See the x86-64 PS ABI for details.
+*/
+static int
+classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
+		   size_t byte_offset)
+{
+  switch (type->type)
+    {
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_POINTER:
+      {
+	int size = byte_offset + type->size;
+
+	if (size <= 4)
+	  {
+	    classes[0] = X86_64_INTEGERSI_CLASS;
+	    return 1;
+	  }
+	else if (size <= 8)
+	  {
+	    classes[0] = X86_64_INTEGER_CLASS;
+	    return 1;
+	  }
+	else if (size <= 12)
+	  {
+	    classes[0] = X86_64_INTEGER_CLASS;
+	    classes[1] = X86_64_INTEGERSI_CLASS;
+	    return 2;
+	  }
+	else if (size <= 16)
+	  {
+	    classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
+	    return 2;
+	  }
+	else
+	  FFI_ASSERT (0);
+      }
+    case FFI_TYPE_FLOAT:
+      if (!(byte_offset % 8))
+	classes[0] = X86_64_SSESF_CLASS;
+      else
+	classes[0] = X86_64_SSE_CLASS;
+      return 1;
+    case FFI_TYPE_DOUBLE:
+      classes[0] = X86_64_SSEDF_CLASS;
+      return 1;
+    case FFI_TYPE_LONGDOUBLE:
+      classes[0] = X86_64_X87_CLASS;
+      classes[1] = X86_64_X87UP_CLASS;
+      return 2;
+    case FFI_TYPE_STRUCT:
+      {
+	const int UNITS_PER_WORD = 8;
+	int words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+	ffi_type **ptr; 
+	int i;
+	enum x86_64_reg_class subclasses[MAX_CLASSES];
+
+	/* If the struct is larger than 32 bytes, pass it on the stack.  */
+	if (type->size > 32)
+	  return 0;
+
+	for (i = 0; i < words; i++)
+	  classes[i] = X86_64_NO_CLASS;
+
+	/* Zero sized arrays or structures are NO_CLASS.  We return 0 to
+	   signalize memory class, so handle it as special case.  */
+	if (!words)
+	  {
+	    classes[0] = X86_64_NO_CLASS;
+	    return 1;
+	  }
+
+	/* Merge the fields of structure.  */
+	for (ptr = type->elements; *ptr != NULL; ptr++)
+	  {
+	    int num;
+
+	    byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
+
+	    num = classify_argument (*ptr, subclasses, byte_offset % 8);
+	    if (num == 0)
+	      return 0;
+	    for (i = 0; i < num; i++)
+	      {
+		int pos = byte_offset / 8;
+		classes[i + pos] =
+		  merge_classes (subclasses[i], classes[i + pos]);
+	      }
+
+	    byte_offset += (*ptr)->size;
+	  }
+
+	if (words > 2)
+	  {
+	    /* When size > 16 bytes, if the first one isn't
+	       X86_64_SSE_CLASS or any other ones aren't
+	       X86_64_SSEUP_CLASS, everything should be passed in
+	       memory.  */
+	    if (classes[0] != X86_64_SSE_CLASS)
+	      return 0;
+
+	    for (i = 1; i < words; i++)
+	      if (classes[i] != X86_64_SSEUP_CLASS)
+		return 0;
+	  }
+
+	/* Final merger cleanup.  */
+	for (i = 0; i < words; i++)
+	  {
+	    /* If one class is MEMORY, everything should be passed in
+	       memory.  */
+	    if (classes[i] == X86_64_MEMORY_CLASS)
+	      return 0;
+
+	    /* The X86_64_SSEUP_CLASS should be always preceded by
+	       X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
+	    if (classes[i] == X86_64_SSEUP_CLASS
+		&& classes[i - 1] != X86_64_SSE_CLASS
+		&& classes[i - 1] != X86_64_SSEUP_CLASS)
+	      {
+		/* The first one should never be X86_64_SSEUP_CLASS.  */
+		FFI_ASSERT (i != 0);
+		classes[i] = X86_64_SSE_CLASS;
+	      }
+
+	    /*  If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
+		everything should be passed in memory.  */
+	    if (classes[i] == X86_64_X87UP_CLASS
+		&& (classes[i - 1] != X86_64_X87_CLASS))
+	      {
+		/* The first one should never be X86_64_X87UP_CLASS.  */
+		FFI_ASSERT (i != 0);
+		return 0;
+	      }
+	  }
+	return words;
+      }
+
+    default:
+      FFI_ASSERT(0);
+    }
+  return 0; /* Never reached.  */
+}
+
+/* Examine the argument and return set number of register required in each
+   class.  Return zero iff parameter should be passed in memory, otherwise
+   the number of registers.  */
+
+static int
+examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
+		  _Bool in_return, int *pngpr, int *pnsse)
+{
+  int i, n, ngpr, nsse;
+
+  n = classify_argument (type, classes, 0);
+  if (n == 0)
+    return 0;
+
+  ngpr = nsse = 0;
+  for (i = 0; i < n; ++i)
+    switch (classes[i])
+      {
+      case X86_64_INTEGER_CLASS:
+      case X86_64_INTEGERSI_CLASS:
+	ngpr++;
+	break;
+      case X86_64_SSE_CLASS:
+      case X86_64_SSESF_CLASS:
+      case X86_64_SSEDF_CLASS:
+	nsse++;
+	break;
+      case X86_64_NO_CLASS:
+      case X86_64_SSEUP_CLASS:
+	break;
+      case X86_64_X87_CLASS:
+      case X86_64_X87UP_CLASS:
+      case X86_64_COMPLEX_X87_CLASS:
+	return in_return != 0;
+      default:
+	abort ();
+      }
+
+  *pngpr = ngpr;
+  *pnsse = nsse;
+
+  return n;
+}
+
+/* Perform machine dependent cif processing.  */
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
+  enum x86_64_reg_class classes[MAX_CLASSES];
+  size_t bytes;
+
+  gprcount = ssecount = 0;
+
+  flags = cif->rtype->type;
+  if (flags != FFI_TYPE_VOID)
+    {
+      n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
+      if (n == 0)
+	{
+	  /* The return value is passed in memory.  A pointer to that
+	     memory is the first argument.  Allocate a register for it.  */
+	  gprcount++;
+	  /* We don't have to do anything in asm for the return.  */
+	  flags = FFI_TYPE_VOID;
+	}
+      else if (flags == FFI_TYPE_STRUCT)
+	{
+	  /* Mark which registers the result appears in.  */
+	  _Bool sse0 = SSE_CLASS_P (classes[0]);
+	  _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
+	  if (sse0 && !sse1)
+	    flags |= 1 << 8;
+	  else if (!sse0 && sse1)
+	    flags |= 1 << 9;
+	  else if (sse0 && sse1)
+	    flags |= 1 << 10;
+	  /* Mark the true size of the structure.  */
+	  flags |= cif->rtype->size << 12;
+	}
+    }
+
+  /* Go over all arguments and determine the way they should be passed.
+     If it's in a register and there is space for it, let that be so. If
+     not, add it's size to the stack byte count.  */
+  for (bytes = 0, i = 0, avn = cif->nargs; i < avn; i++)
+    {
+      if (examine_argument (cif->arg_types[i], classes, 0, &ngpr, &nsse) == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
+	{
+	  long align = cif->arg_types[i]->alignment;
+
+	  if (align < 8)
+	    align = 8;
+
+	  bytes = ALIGN (bytes, align);
+	  bytes += cif->arg_types[i]->size;
+	}
+      else
+	{
+	  gprcount += ngpr;
+	  ssecount += nsse;
+	}
+    }
+  if (ssecount)
+    flags |= 1 << 11;
+  cif->flags = flags;
+  cif->bytes = ALIGN (bytes, 8);
+
+  return FFI_OK;
+}
+
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  enum x86_64_reg_class classes[MAX_CLASSES];
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int gprcount, ssecount, ngpr, nsse, i, avn;
+  _Bool ret_in_memory;
+  struct register_args *reg_args;
+
+  /* Can't call 32-bit mode from 64-bit mode.  */
+  FFI_ASSERT (cif->abi == FFI_UNIX64);
+
+  /* If the return value is a struct and we don't have a return value
+     address then we need to make one.  Note the setting of flags to
+     VOID above in ffi_prep_cif_machdep.  */
+  ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
+		   && (cif->flags & 0xff) == FFI_TYPE_VOID);
+  if (rvalue == NULL && ret_in_memory)
+    rvalue = alloca (cif->rtype->size);
+
+  /* Allocate the space for the arguments, plus 4 words of temp space.  */
+  stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
+  reg_args = (struct register_args *) stack;
+  argp = stack + sizeof (struct register_args);
+
+  gprcount = ssecount = 0;
+
+  /* If the return value is passed in memory, add the pointer as the
+     first integer argument.  */
+  if (ret_in_memory)
+    reg_args->gpr[gprcount++] = (unsigned long) rvalue;
+
+  avn = cif->nargs;
+  arg_types = cif->arg_types;
+
+  for (i = 0; i < avn; ++i)
+    {
+      size_t size = arg_types[i]->size;
+      int n;
+
+      n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
+      if (n == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
+	{
+	  long align = arg_types[i]->alignment;
+
+	  /* Stack arguments are *always* at least 8 byte aligned.  */
+	  if (align < 8)
+	    align = 8;
+
+	  /* Pass this argument in memory.  */
+	  argp = (void *) ALIGN (argp, align);
+	  memcpy (argp, avalue[i], size);
+	  argp += size;
+	}
+      else
+	{
+	  /* The argument is passed entirely in registers.  */
+	  char *a = (char *) avalue[i];
+	  int j;
+
+	  for (j = 0; j < n; j++, a += 8, size -= 8)
+	    {
+	      switch (classes[j])
+		{
+		case X86_64_INTEGER_CLASS:
+		case X86_64_INTEGERSI_CLASS:
+		  reg_args->gpr[gprcount] = 0;
+		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+		  gprcount++;
+		  break;
+		case X86_64_SSE_CLASS:
+		case X86_64_SSEDF_CLASS:
+		  reg_args->sse[ssecount++] = *(UINT64 *) a;
+		  break;
+		case X86_64_SSESF_CLASS:
+		  reg_args->sse[ssecount++] = *(UINT32 *) a;
+		  break;
+		default:
+		  abort();
+		}
+	    }
+	}
+    }
+
+  ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
+		   cif->flags, rvalue, fn, ssecount);
+}
+
+
+extern void ffi_closure_unix64(void);
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+		      ffi_cif* cif,
+		      void (*fun)(ffi_cif*, void*, void**, void*),
+		      void *user_data,
+		      void *codeloc)
+{
+  volatile unsigned short *tramp;
+
+  /* Sanity check on the cif ABI.  */
+  {
+    int abi = cif->abi;
+    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
+      return FFI_BAD_ABI;
+  }
+
+  tramp = (volatile unsigned short *) &closure->tramp[0];
+
+  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
+  *((unsigned long long * volatile) &tramp[1])
+    = (unsigned long) ffi_closure_unix64;
+  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
+  *((unsigned long long * volatile) &tramp[6])
+    = (unsigned long) codeloc;
+
+  /* Set the carry bit iff the function uses any sse registers.
+     This is clc or stc, together with the first byte of the jmp.  */
+  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+
+  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+int
+ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
+			 struct register_args *reg_args, char *argp)
+{
+  ffi_cif *cif;
+  void **avalue;
+  ffi_type **arg_types;
+  long i, avn;
+  int gprcount, ssecount, ngpr, nsse;
+  int ret;
+
+  cif = closure->cif;
+  avalue = alloca(cif->nargs * sizeof(void *));
+  gprcount = ssecount = 0;
+
+  ret = cif->rtype->type;
+  if (ret != FFI_TYPE_VOID)
+    {
+      enum x86_64_reg_class classes[MAX_CLASSES];
+      int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
+      if (n == 0)
+	{
+	  /* The return value goes in memory.  Arrange for the closure
+	     return value to go directly back to the original caller.  */
+	  rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
+	  /* We don't have to do anything in asm for the return.  */
+	  ret = FFI_TYPE_VOID;
+	}
+      else if (ret == FFI_TYPE_STRUCT && n == 2)
+	{
+	  /* Mark which register the second word of the structure goes in.  */
+	  _Bool sse0 = SSE_CLASS_P (classes[0]);
+	  _Bool sse1 = SSE_CLASS_P (classes[1]);
+	  if (!sse0 && sse1)
+	    ret |= 1 << 8;
+	  else if (sse0 && !sse1)
+	    ret |= 1 << 9;
+	}
+    }
+
+  avn = cif->nargs;
+  arg_types = cif->arg_types;
+  
+  for (i = 0; i < avn; ++i)
+    {
+      enum x86_64_reg_class classes[MAX_CLASSES];
+      int n;
+
+      n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
+      if (n == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
+	{
+	  long align = arg_types[i]->alignment;
+
+	  /* Stack arguments are *always* at least 8 byte aligned.  */
+	  if (align < 8)
+	    align = 8;
+
+	  /* Pass this argument in memory.  */
+	  argp = (void *) ALIGN (argp, align);
+	  avalue[i] = argp;
+	  argp += arg_types[i]->size;
+	}
+      /* If the argument is in a single register, or two consecutive
+	 integer registers, then we can use that address directly.  */
+      else if (n == 1
+	       || (n == 2 && !(SSE_CLASS_P (classes[0])
+			       || SSE_CLASS_P (classes[1]))))
+	{
+	  /* The argument is in a single register.  */
+	  if (SSE_CLASS_P (classes[0]))
+	    {
+	      avalue[i] = &reg_args->sse[ssecount];
+	      ssecount += n;
+	    }
+	  else
+	    {
+	      avalue[i] = &reg_args->gpr[gprcount];
+	      gprcount += n;
+	    }
+	}
+      /* Otherwise, allocate space to make them consecutive.  */
+      else
+	{
+	  char *a = alloca (16);
+	  int j;
+
+	  avalue[i] = a;
+	  for (j = 0; j < n; j++, a += 8)
+	    {
+	      if (SSE_CLASS_P (classes[j]))
+		memcpy (a, &reg_args->sse[ssecount++], 8);
+	      else
+		memcpy (a, &reg_args->gpr[gprcount++], 8);
+	    }
+	}
+    }
+
+  /* Invoke the closure.  */
+  closure->fun (cif, rvalue, avalue, closure->user_data);
+
+  /* Tell assembly how to perform return type promotions.  */
+  return ret;
+}
+
+#endif /* __x86_64__ */
diff --git a/libffi/src/x86/darwin_c.c b/libffi/src/x86/darwin_c.c
new file mode 100644
index 0000000..6338de2
--- /dev/null
+++ b/libffi/src/x86/darwin_c.c
@@ -0,0 +1,843 @@
+/* -----------------------------------------------------------------------
+   ffi.c - Copyright (c) 1996, 1998, 1999, 2001, 2007, 2008  Red Hat, Inc.
+           Copyright (c) 2002  Ranjit Mathew
+           Copyright (c) 2002  Bo Thorsen
+           Copyright (c) 2002  Roger Sayle
+           Copyright (C) 2008, 2010  Free Software Foundation, Inc.
+
+   x86 Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
+
+#ifdef _WIN64
+#include <windows.h>
+#endif
+
+#include <ffi.h>
+#include <ffi_common.h>
+
+#include <stdlib.h>
+
+/* ffi_prep_args is called by the assembly routine once stack space
+   has been allocated for the function's arguments */
+
+void ffi_prep_args(char *stack, extended_cif *ecif)
+{
+  register unsigned int i;
+  register void **p_argv;
+  register char *argp;
+  register ffi_type **p_arg;
+#ifdef X86_WIN32
+  size_t p_stack_args[2];
+  void *p_stack_data[2];
+  char *argp2 = stack;
+  int stack_args_count = 0;
+  int cabi = ecif->cif->abi;
+#endif
+
+  argp = stack;
+
+  if ((ecif->cif->flags == FFI_TYPE_STRUCT
+       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
+#ifdef X86_WIN64
+      && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
+          && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
+#endif
+      )
+    {
+      *(void **) argp = ecif->rvalue;
+#ifdef X86_WIN32
+      /* For fastcall/thiscall this is first register-passed
+         argument.  */
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  p_stack_args[stack_args_count] = sizeof (void*);
+	  p_stack_data[stack_args_count] = argp;
+	  ++stack_args_count;
+	}
+#endif
+      argp += sizeof(void*);
+    }
+
+  p_argv = ecif->avalue;
+
+  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
+       i != 0;
+       i--, p_arg++)
+    {
+      size_t z;
+
+      /* Align if necessary */
+      if ((sizeof(void*) - 1) & (size_t) argp)
+        argp = (char *) ALIGN(argp, sizeof(void*));
+
+      z = (*p_arg)->size;
+#ifdef X86_WIN64
+      if (z > sizeof(ffi_arg)
+          || ((*p_arg)->type == FFI_TYPE_STRUCT
+              && (z != 1 && z != 2 && z != 4 && z != 8))
+#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
+#endif
+          )
+        {
+          z = sizeof(ffi_arg);
+          *(void **)argp = *p_argv;
+        }
+      else if ((*p_arg)->type == FFI_TYPE_FLOAT)
+        {
+          memcpy(argp, *p_argv, z);
+        }
+      else
+#endif
+      if (z < sizeof(ffi_arg))
+        {
+          z = sizeof(ffi_arg);
+          switch ((*p_arg)->type)
+            {
+            case FFI_TYPE_SINT8:
+              *(ffi_sarg *) argp = (ffi_sarg)*(SINT8 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_UINT8:
+              *(ffi_arg *) argp = (ffi_arg)*(UINT8 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_SINT16:
+              *(ffi_sarg *) argp = (ffi_sarg)*(SINT16 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_UINT16:
+              *(ffi_arg *) argp = (ffi_arg)*(UINT16 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_SINT32:
+              *(ffi_sarg *) argp = (ffi_sarg)*(SINT32 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_UINT32:
+              *(ffi_arg *) argp = (ffi_arg)*(UINT32 *)(* p_argv);
+              break;
+
+            case FFI_TYPE_STRUCT:
+              *(ffi_arg *) argp = *(ffi_arg *)(* p_argv);
+              break;
+
+            default:
+              FFI_ASSERT(0);
+            }
+        }
+      else
+        {
+          memcpy(argp, *p_argv, z);
+        }
+
+#ifdef X86_WIN32
+    /* For thiscall/fastcall convention register-passed arguments
+       are the first two none-floating-point arguments with a size
+       smaller or equal to sizeof (void*).  */
+    if ((cabi == FFI_THISCALL && stack_args_count < 1)
+        || (cabi == FFI_FASTCALL && stack_args_count < 2))
+      {
+	if (z <= 4
+	    && ((*p_arg)->type != FFI_TYPE_FLOAT
+	        && (*p_arg)->type != FFI_TYPE_STRUCT))
+	  {
+	    p_stack_args[stack_args_count] = z;
+	    p_stack_data[stack_args_count] = argp;
+	    ++stack_args_count;
+	  }
+      }
+#endif
+      p_argv++;
+#ifdef X86_WIN64
+      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+#else
+      argp += z;
+#endif
+    }
+
+#ifdef X86_WIN32
+  /* We need to move the register-passed arguments for thiscall/fastcall
+     on top of stack, so that those can be moved to registers ecx/edx by
+     call-handler.  */
+  if (stack_args_count > 0)
+    {
+      size_t zz = (p_stack_args[0] + 3) & ~3;
+      char *h;
+
+      /* Move first argument to top-stack position.  */
+      if (p_stack_data[0] != argp2)
+	{
+	  h = alloca (zz + 1);
+	  memcpy (h, p_stack_data[0], zz);
+	  memmove (argp2 + zz, argp2,
+	           (size_t) ((char *) p_stack_data[0] - (char*)argp2));
+	  memcpy (argp2, h, zz);
+	}
+
+      argp2 += zz;
+      --stack_args_count;
+      if (zz > 4)
+	stack_args_count = 0;
+
+      /* If we have a second argument, then move it on top
+         after the first one.  */
+      if (stack_args_count > 0 && p_stack_data[1] != argp2)
+	{
+	  zz = p_stack_args[1];
+	  zz = (zz + 3) & ~3;
+	  h = alloca (zz + 1);
+	  h = alloca (zz + 1);
+	  memcpy (h, p_stack_data[1], zz);
+	  memmove (argp2 + zz, argp2, (size_t) ((char*) p_stack_data[1] - (char*)argp2));
+	  memcpy (argp2, h, zz);
+	}
+    }
+#endif
+  return;
+}
+
+/* Perform machine dependent cif processing */
+ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
+{
+  unsigned int i;
+  ffi_type **ptr;
+
+  /* Set the return type flag */
+  switch (cif->rtype->type)
+    {
+    case FFI_TYPE_VOID:
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_SINT16:
+#ifdef X86_WIN64
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+#endif
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_FLOAT:
+    case FFI_TYPE_DOUBLE:
+#ifndef X86_WIN64
+#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+    case FFI_TYPE_LONGDOUBLE:
+#endif
+#endif
+      cif->flags = (unsigned) cif->rtype->type;
+      break;
+
+    case FFI_TYPE_UINT64:
+#ifdef X86_WIN64
+    case FFI_TYPE_POINTER:
+#endif
+      cif->flags = FFI_TYPE_SINT64;
+      break;
+
+    case FFI_TYPE_STRUCT:
+#ifndef X86
+      if (cif->rtype->size == 1)
+        {
+          cif->flags = FFI_TYPE_SMALL_STRUCT_1B; /* same as char size */
+        }
+      else if (cif->rtype->size == 2)
+        {
+          cif->flags = FFI_TYPE_SMALL_STRUCT_2B; /* same as short size */
+        }
+      else if (cif->rtype->size == 4)
+        {
+#ifdef X86_WIN64
+          cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
+#else
+          cif->flags = FFI_TYPE_INT; /* same as int type */
+#endif
+        }
+      else if (cif->rtype->size == 8)
+        {
+          cif->flags = FFI_TYPE_SINT64; /* same as int64 type */
+        }
+      else
+#endif
+        {
+#ifdef X86_WIN32
+          if (cif->abi == FFI_MS_CDECL)
+            cif->flags = FFI_TYPE_MS_STRUCT;
+          else
+#endif
+            cif->flags = FFI_TYPE_STRUCT;
+          /* allocate space for return value pointer */
+          cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
+        }
+      break;
+
+    default:
+#ifdef X86_WIN64
+      cif->flags = FFI_TYPE_SINT64;
+      break;
+    case FFI_TYPE_INT:
+      cif->flags = FFI_TYPE_SINT32;
+#else
+      cif->flags = FFI_TYPE_INT;
+#endif
+      break;
+    }
+
+  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++)
+    {
+      if (((*ptr)->alignment - 1) & cif->bytes)
+        cif->bytes = ALIGN(cif->bytes, (*ptr)->alignment);
+      cif->bytes += ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
+    }
+
+#ifdef X86_WIN64
+  /* ensure space for storing four registers */
+  cif->bytes += 4 * sizeof(ffi_arg);
+#endif
+
+#ifdef X86_DARWIN
+  cif->bytes = (cif->bytes + 15) & ~0xF;
+#endif
+
+  return FFI_OK;
+}
+
+#ifdef X86_WIN64
+extern int
+ffi_call_win64(void (*)(char *, extended_cif *), extended_cif *,
+               unsigned, unsigned, unsigned *, void (*fn)(void));
+#elif defined(X86_WIN32)
+extern void
+ffi_call_win32(void (*)(char *, extended_cif *), extended_cif *,
+               unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
+#else
+extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
+                          unsigned, unsigned, unsigned *, void (*fn)(void));
+#endif
+
+void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  extended_cif ecif;
+
+  ecif.cif = cif;
+  ecif.avalue = avalue;
+  
+  /* If the return value is a struct and we don't have a return */
+  /* value address then we need to make one                     */
+
+#ifdef X86_WIN64
+  if (rvalue == NULL
+      && cif->flags == FFI_TYPE_STRUCT
+      && cif->rtype->size != 1 && cif->rtype->size != 2
+      && cif->rtype->size != 4 && cif->rtype->size != 8)
+    {
+      ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
+    }
+#else
+  if (rvalue == NULL
+      && (cif->flags == FFI_TYPE_STRUCT
+          || cif->flags == FFI_TYPE_MS_STRUCT))
+    {
+      ecif.rvalue = alloca(cif->rtype->size);
+    }
+#endif
+  else
+    ecif.rvalue = rvalue;
+    
+  
+  switch (cif->abi) 
+    {
+#ifdef X86_WIN64
+    case FFI_WIN64:
+      ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
+                     cif->flags, ecif.rvalue, fn);
+      break;
+#elif defined(X86_WIN32)
+    case FFI_SYSV:
+    case FFI_STDCALL:
+    case FFI_MS_CDECL:
+      ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
+		     ecif.rvalue, fn);
+      break;
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+      {
+	unsigned int abi = cif->abi;
+	unsigned int i, passed_regs = 0;
+
+	if (cif->flags == FFI_TYPE_STRUCT)
+	  ++passed_regs;
+
+	for (i=0; i < cif->nargs && passed_regs < 2;i++)
+	  {
+	    size_t sz;
+
+	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
+	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
+	      continue;
+	    sz = (cif->arg_types[i]->size + 3) & ~3;
+	    if (sz == 0 || sz > 4)
+	      continue;
+	    ++passed_regs;
+	  }
+	if (passed_regs < 2 && abi == FFI_FASTCALL)
+	  abi = FFI_THISCALL;
+	if (passed_regs < 1 && abi == FFI_THISCALL)
+	  abi = FFI_STDCALL;
+        ffi_call_win32(ffi_prep_args, &ecif, abi, cif->bytes, cif->flags,
+                       ecif.rvalue, fn);
+      }
+      break;
+#else
+    case FFI_SYSV:
+      ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
+                    fn);
+      break;
+#endif
+    default:
+      FFI_ASSERT(0);
+      break;
+    }
+}
+
+
+/** private members **/
+
+/* The following __attribute__((regparm(1))) decorations will have no effect
+   on MSVC - standard cdecl convention applies. */
+static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
+                                         void** args, ffi_cif* cif);
+void FFI_HIDDEN ffi_closure_SYSV (ffi_closure *)
+     __attribute__ ((regparm(1)));
+unsigned int FFI_HIDDEN ffi_closure_SYSV_inner (ffi_closure *, void **, void *)
+     __attribute__ ((regparm(1)));
+void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
+     __attribute__ ((regparm(1)));
+#ifdef X86_WIN32
+void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
+     __attribute__ ((regparm(1)));
+void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *)
+     __attribute__ ((regparm(1)));
+void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *)
+     __attribute__ ((regparm(1)));
+#endif
+#ifdef X86_WIN64
+void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
+#endif
+
+/* This function is jumped to by the trampoline */
+
+#ifdef X86_WIN64
+void * FFI_HIDDEN
+ffi_closure_win64_inner (ffi_closure *closure, void *args) {
+  ffi_cif       *cif;
+  void         **arg_area;
+  void          *result;
+  void          *resp = &result;
+
+  cif         = closure->cif;
+  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
+
+  /* this call will initialize ARG_AREA, such that each
+   * element in that array points to the corresponding 
+   * value on the stack; and if the function returns
+   * a structure, it will change RESP to point to the
+   * structure return address.  */
+
+  ffi_prep_incoming_args_SYSV(args, &resp, arg_area, cif);
+  
+  (closure->fun) (cif, resp, arg_area, closure->user_data);
+
+  /* The result is returned in rax.  This does the right thing for
+     result types except for floats; we have to 'mov xmm0, rax' in the
+     caller to correct this.
+     TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
+  */
+  return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
+}
+
+#else
+unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
+ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
+{
+  /* our various things...  */
+  ffi_cif       *cif;
+  void         **arg_area;
+
+  cif         = closure->cif;
+  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
+
+  /* this call will initialize ARG_AREA, such that each
+   * element in that array points to the corresponding 
+   * value on the stack; and if the function returns
+   * a structure, it will change RESP to point to the
+   * structure return address.  */
+
+  ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
+
+  (closure->fun) (cif, *respp, arg_area, closure->user_data);
+
+  return cif->flags;
+}
+#endif /* !X86_WIN64 */
+
+static void
+ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
+                            ffi_cif *cif)
+{
+  register unsigned int i;
+  register void **p_argv;
+  register char *argp;
+  register ffi_type **p_arg;
+
+  argp = stack;
+
+#ifdef X86_WIN64
+  if (cif->rtype->size > sizeof(ffi_arg)
+      || (cif->flags == FFI_TYPE_STRUCT
+          && (cif->rtype->size != 1 && cif->rtype->size != 2
+              && cif->rtype->size != 4 && cif->rtype->size != 8))) {
+    *rvalue = *(void **) argp;
+    argp += sizeof(void *);
+  }
+#else
+  if ( cif->flags == FFI_TYPE_STRUCT
+       || cif->flags == FFI_TYPE_MS_STRUCT ) {
+    *rvalue = *(void **) argp;
+    argp += sizeof(void *);
+  }
+#endif
+
+  p_argv = avalue;
+
+  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
+    {
+      size_t z;
+
+      /* Align if necessary */
+      if ((sizeof(void*) - 1) & (size_t) argp) {
+        argp = (char *) ALIGN(argp, sizeof(void*));
+      }
+
+#ifdef X86_WIN64
+      if ((*p_arg)->size > sizeof(ffi_arg)
+          || ((*p_arg)->type == FFI_TYPE_STRUCT
+              && ((*p_arg)->size != 1 && (*p_arg)->size != 2
+                  && (*p_arg)->size != 4 && (*p_arg)->size != 8)))
+        {
+          z = sizeof(void *);
+          *p_argv = *(void **)argp;
+        }
+      else
+#endif
+        {
+          z = (*p_arg)->size;
+          
+          /* because we're little endian, this is what it turns into.   */
+          
+          *p_argv = (void*) argp;
+        }
+          
+      p_argv++;
+#ifdef X86_WIN64
+      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+#else
+      argp += z;
+#endif
+    }
+  
+  return;
+}
+
+#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
+{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+   void*  __fun = (void*)(FUN); \
+   void*  __ctx = (void*)(CTX); \
+   *(unsigned char*) &__tramp[0] = 0x41; \
+   *(unsigned char*) &__tramp[1] = 0xbb; \
+   *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
+   *(unsigned char*) &__tramp[6] = 0x48; \
+   *(unsigned char*) &__tramp[7] = 0xb8; \
+   *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
+   *(unsigned char *)  &__tramp[16] = 0x49; \
+   *(unsigned char *)  &__tramp[17] = 0xba; \
+   *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
+   *(unsigned char *)  &__tramp[26] = 0x41; \
+   *(unsigned char *)  &__tramp[27] = 0xff; \
+   *(unsigned char *)  &__tramp[28] = 0xe2; /* jmp %r10 */ \
+ }
+
+/* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
+
+#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
+{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+   unsigned int  __fun = (unsigned int)(FUN); \
+   unsigned int  __ctx = (unsigned int)(CTX); \
+   unsigned int  __dis = __fun - (__ctx + 10);  \
+   *(unsigned char*) &__tramp[0] = 0xb8; \
+   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
+   *(unsigned char *)  &__tramp[5] = 0xe9; \
+   *(unsigned int*)  &__tramp[6] = __dis; /* jmp __fun  */ \
+ }
+
+#define FFI_INIT_TRAMPOLINE_THISCALL(TRAMP,FUN,CTX,SIZE) \
+{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+   unsigned int  __fun = (unsigned int)(FUN); \
+   unsigned int  __ctx = (unsigned int)(CTX); \
+   unsigned int  __dis = __fun - (__ctx + 49);  \
+   unsigned short __size = (unsigned short)(SIZE); \
+   *(unsigned int *) &__tramp[0] = 0x8324048b;	/* mov (%esp), %eax */ \
+   *(unsigned int *) &__tramp[4] = 0x4c890cec;	/* sub $12, %esp */ \
+   *(unsigned int *) &__tramp[8] = 0x04890424;	/* mov %ecx, 4(%esp) */ \
+   *(unsigned char*) &__tramp[12] = 0x24;	/* mov %eax, (%esp) */ \
+   *(unsigned char*) &__tramp[13] = 0xb8; \
+   *(unsigned int *) &__tramp[14] = __size;	/* mov __size, %eax */ \
+   *(unsigned int *) &__tramp[18] = 0x08244c8d;	/* lea 8(%esp), %ecx */ \
+   *(unsigned int *) &__tramp[22] = 0x4802e8c1; /* shr $2, %eax ; dec %eax */ \
+   *(unsigned short*) &__tramp[26] = 0x0b74;	/* jz 1f */ \
+   *(unsigned int *) &__tramp[28] = 0x8908518b;	/* 2b: mov 8(%ecx), %edx */ \
+   *(unsigned int *) &__tramp[32] = 0x04c18311; /* mov %edx, (%ecx) ; add $4, %ecx */ \
+   *(unsigned char*) &__tramp[36] = 0x48;	/* dec %eax */ \
+   *(unsigned short*) &__tramp[37] = 0xf575;	/* jnz 2b ; 1f: */ \
+   *(unsigned char*) &__tramp[39] = 0xb8; \
+   *(unsigned int*)  &__tramp[40] = __ctx; /* movl __ctx, %eax */ \
+   *(unsigned char *)  &__tramp[44] = 0xe8; \
+   *(unsigned int*)  &__tramp[45] = __dis; /* call __fun  */ \
+   *(unsigned char*)  &__tramp[49] = 0xc2; /* ret  */ \
+   *(unsigned short*)  &__tramp[50] = (__size + 8); /* ret (__size + 8)  */ \
+ }
+
+#define FFI_INIT_TRAMPOLINE_STDCALL(TRAMP,FUN,CTX,SIZE)  \
+{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
+   unsigned int  __fun = (unsigned int)(FUN); \
+   unsigned int  __ctx = (unsigned int)(CTX); \
+   unsigned int  __dis = __fun - (__ctx + 10); \
+   unsigned short __size = (unsigned short)(SIZE); \
+   *(unsigned char*) &__tramp[0] = 0xb8; \
+   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
+   *(unsigned char *)  &__tramp[5] = 0xe8; \
+   *(unsigned int*)  &__tramp[6] = __dis; /* call __fun  */ \
+   *(unsigned char *)  &__tramp[10] = 0xc2; \
+   *(unsigned short*)  &__tramp[11] = __size; /* ret __size  */ \
+ }
+
+/* the cif must already be prep'ed */
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+                      ffi_cif* cif,
+                      void (*fun)(ffi_cif*,void*,void**,void*),
+                      void *user_data,
+                      void *codeloc)
+{
+#ifdef X86_WIN64
+#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
+#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
+  if (cif->abi == FFI_WIN64) 
+    {
+      int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
+      FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
+                                 &ffi_closure_win64,
+                                 codeloc, mask);
+      /* make sure we can execute here */
+    }
+#else
+  if (cif->abi == FFI_SYSV)
+    {
+      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
+                           &ffi_closure_SYSV,
+                           (void*)codeloc);
+    }
+#ifdef X86_WIN32
+  else if (cif->abi == FFI_THISCALL)
+    {
+      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0],
+				    &ffi_closure_THISCALL,
+				    (void*)codeloc,
+				    cif->bytes);
+    }
+  else if (cif->abi == FFI_STDCALL)
+    {
+      FFI_INIT_TRAMPOLINE_STDCALL (&closure->tramp[0],
+                                   &ffi_closure_STDCALL,
+                                   (void*)codeloc, cif->bytes);
+    }
+  else if (cif->abi == FFI_MS_CDECL)
+    {
+      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
+                           &ffi_closure_SYSV,
+                           (void*)codeloc);
+    }
+#endif /* X86_WIN32 */
+#endif /* !X86_WIN64 */
+  else
+    {
+      return FFI_BAD_ABI;
+    }
+    
+  closure->cif  = cif;
+  closure->user_data = user_data;
+  closure->fun  = fun;
+
+  return FFI_OK;
+}
+
+/* ------- Native raw API support -------------------------------- */
+
+#if !FFI_NO_RAW_API
+
+ffi_status
+ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
+                          ffi_cif* cif,
+                          void (*fun)(ffi_cif*,void*,ffi_raw*,void*),
+                          void *user_data,
+                          void *codeloc)
+{
+  int i;
+
+  if (cif->abi != FFI_SYSV) {
+#ifdef X86_WIN32
+    if (cif->abi != FFI_THISCALL)
+#endif
+    return FFI_BAD_ABI;
+  }
+
+  /* we currently don't support certain kinds of arguments for raw
+     closures.  This should be implemented by a separate assembly
+     language routine, since it would require argument processing,
+     something we don't do now for performance.  */
+
+  for (i = cif->nargs-1; i >= 0; i--)
+    {
+      FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_STRUCT);
+      FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_LONGDOUBLE);
+    }
+  
+#ifdef X86_WIN32
+  if (cif->abi == FFI_SYSV)
+    {
+#endif
+  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_raw_SYSV,
+                       codeloc);
+#ifdef X86_WIN32
+    }
+  else if (cif->abi == FFI_THISCALL)
+    {
+      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0], &ffi_closure_raw_THISCALL,
+				    codeloc, cif->bytes);
+    }
+#endif
+  closure->cif  = cif;
+  closure->user_data = user_data;
+  closure->fun  = fun;
+
+  return FFI_OK;
+}
+
+static void 
+ffi_prep_args_raw(char *stack, extended_cif *ecif)
+{
+  memcpy (stack, ecif->avalue, ecif->cif->bytes);
+}
+
+/* we borrow this routine from libffi (it must be changed, though, to
+ * actually call the function passed in the first argument.  as of
+ * libffi-1.20, this is not the case.)
+ */
+
+void
+ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
+{
+  extended_cif ecif;
+  void **avalue = (void **)fake_avalue;
+
+  ecif.cif = cif;
+  ecif.avalue = avalue;
+  
+  /* If the return value is a struct and we don't have a return */
+  /* value address then we need to make one                     */
+
+  if (rvalue == NULL
+      && (cif->flags == FFI_TYPE_STRUCT
+          || cif->flags == FFI_TYPE_MS_STRUCT))
+    {
+      ecif.rvalue = alloca(cif->rtype->size);
+    }
+  else
+    ecif.rvalue = rvalue;
+    
+  
+  switch (cif->abi) 
+    {
+#ifdef X86_WIN32
+    case FFI_SYSV:
+    case FFI_STDCALL:
+    case FFI_MS_CDECL:
+      ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
+		     ecif.rvalue, fn);
+      break;
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+      {
+	unsigned int abi = cif->abi;
+	unsigned int i, passed_regs = 0;
+
+	if (cif->flags == FFI_TYPE_STRUCT)
+	  ++passed_regs;
+
+	for (i=0; i < cif->nargs && passed_regs < 2;i++)
+	  {
+	    size_t sz;
+
+	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
+	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
+	      continue;
+	    sz = (cif->arg_types[i]->size + 3) & ~3;
+	    if (sz == 0 || sz > 4)
+	      continue;
+	    ++passed_regs;
+	  }
+	if (passed_regs < 2 && abi == FFI_FASTCALL)
+	  cif->abi = abi = FFI_THISCALL;
+	if (passed_regs < 1 && abi == FFI_THISCALL)
+	  cif->abi = abi = FFI_STDCALL;
+        ffi_call_win32(ffi_prep_args_raw, &ecif, abi, cif->bytes, cif->flags,
+                       ecif.rvalue, fn);
+      }
+      break;
+#else
+    case FFI_SYSV:
+      ffi_call_SYSV(ffi_prep_args_raw, &ecif, cif->bytes, cif->flags,
+                    ecif.rvalue, fn);
+      break;
+#endif
+    default:
+      FFI_ASSERT(0);
+      break;
+    }
+}
+
+#endif
+
+#endif /* !__x86_64__  || X86_WIN64 */
+
diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index 6338de2..3885e39 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -28,620 +28,473 @@
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
-
-#ifdef _WIN64
-#include <windows.h>
-#endif
-
+#ifndef __x86_64__
 #include <ffi.h>
 #include <ffi_common.h>
-
 #include <stdlib.h>
-
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments */
-
-void ffi_prep_args(char *stack, extended_cif *ecif)
-{
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-#ifdef X86_WIN32
-  size_t p_stack_args[2];
-  void *p_stack_data[2];
-  char *argp2 = stack;
-  int stack_args_count = 0;
-  int cabi = ecif->cif->abi;
+#include "internal.h"
+
+/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
+   all further uses in this file will refer to the 80-bit type.  */
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+# if FFI_TYPE_LONGDOUBLE != 4
+#  error FFI_TYPE_LONGDOUBLE out of date
+# endif
+#else
+# undef FFI_TYPE_LONGDOUBLE
+# define FFI_TYPE_LONGDOUBLE 4
 #endif
 
-  argp = stack;
-
-  if ((ecif->cif->flags == FFI_TYPE_STRUCT
-       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
-      && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
-          && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
+#if defined(__GNUC__) && !defined(__declspec)
+# define __declspec(x)  __attribute__((x))
 #endif
-      )
-    {
-      *(void **) argp = ecif->rvalue;
-#ifdef X86_WIN32
-      /* For fastcall/thiscall this is first register-passed
-         argument.  */
-      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
-	{
-	  p_stack_args[stack_args_count] = sizeof (void*);
-	  p_stack_data[stack_args_count] = argp;
-	  ++stack_args_count;
-	}
-#endif
-      argp += sizeof(void*);
-    }
-
-  p_argv = ecif->avalue;
 
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       i != 0;
-       i--, p_arg++)
-    {
-      size_t z;
-
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp)
-        argp = (char *) ALIGN(argp, sizeof(void*));
-
-      z = (*p_arg)->size;
-#ifdef X86_WIN64
-      if (z > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && (z != 1 && z != 2 && z != 4 && z != 8))
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
-          )
-        {
-          z = sizeof(ffi_arg);
-          *(void **)argp = *p_argv;
-        }
-      else if ((*p_arg)->type == FFI_TYPE_FLOAT)
-        {
-          memcpy(argp, *p_argv, z);
-        }
-      else
-#endif
-      if (z < sizeof(ffi_arg))
-        {
-          z = sizeof(ffi_arg);
-          switch ((*p_arg)->type)
-            {
-            case FFI_TYPE_SINT8:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT8:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT16:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT16:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT32:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT32:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_STRUCT:
-              *(ffi_arg *) argp = *(ffi_arg *)(* p_argv);
-              break;
-
-            default:
-              FFI_ASSERT(0);
-            }
-        }
-      else
-        {
-          memcpy(argp, *p_argv, z);
-        }
-
-#ifdef X86_WIN32
-    /* For thiscall/fastcall convention register-passed arguments
-       are the first two none-floating-point arguments with a size
-       smaller or equal to sizeof (void*).  */
-    if ((cabi == FFI_THISCALL && stack_args_count < 1)
-        || (cabi == FFI_FASTCALL && stack_args_count < 2))
-      {
-	if (z <= 4
-	    && ((*p_arg)->type != FFI_TYPE_FLOAT
-	        && (*p_arg)->type != FFI_TYPE_STRUCT))
-	  {
-	    p_stack_args[stack_args_count] = z;
-	    p_stack_data[stack_args_count] = argp;
-	    ++stack_args_count;
-	  }
-      }
-#endif
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
-      argp += z;
-#endif
-    }
+/* Perform machine dependent cif processing.  */
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep(ffi_cif *cif)
+{
+  size_t bytes = 0;
+  int i, n, flags, cabi = cif->abi;
 
-#ifdef X86_WIN32
-  /* We need to move the register-passed arguments for thiscall/fastcall
-     on top of stack, so that those can be moved to registers ecx/edx by
-     call-handler.  */
-  if (stack_args_count > 0)
+  switch (cabi)
     {
-      size_t zz = (p_stack_args[0] + 3) & ~3;
-      char *h;
-
-      /* Move first argument to top-stack position.  */
-      if (p_stack_data[0] != argp2)
-	{
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[0], zz);
-	  memmove (argp2 + zz, argp2,
-	           (size_t) ((char *) p_stack_data[0] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
-
-      argp2 += zz;
-      --stack_args_count;
-      if (zz > 4)
-	stack_args_count = 0;
-
-      /* If we have a second argument, then move it on top
-         after the first one.  */
-      if (stack_args_count > 0 && p_stack_data[1] != argp2)
-	{
-	  zz = p_stack_args[1];
-	  zz = (zz + 3) & ~3;
-	  h = alloca (zz + 1);
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[1], zz);
-	  memmove (argp2 + zz, argp2, (size_t) ((char*) p_stack_data[1] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
+    case FFI_SYSV:
+    case FFI_STDCALL:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+    case FFI_PASCAL:
+    case FFI_REGISTER:
+      break;
+    default:
+      return FFI_BAD_ABI;
     }
-#endif
-  return;
-}
-
-/* Perform machine dependent cif processing */
-ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
-{
-  unsigned int i;
-  ffi_type **ptr;
 
-  /* Set the return type flag */
   switch (cif->rtype->type)
     {
     case FFI_TYPE_VOID:
+      flags = X86_RET_VOID;
+      break;
+    case FFI_TYPE_FLOAT:
+      flags = X86_RET_FLOAT;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = X86_RET_DOUBLE;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = X86_RET_LDOUBLE;
+      break;
     case FFI_TYPE_UINT8:
+      flags = X86_RET_UINT8;
+      break;
     case FFI_TYPE_UINT16:
+      flags = X86_RET_UINT16;
+      break;
     case FFI_TYPE_SINT8:
+      flags = X86_RET_SINT8;
+      break;
     case FFI_TYPE_SINT16:
-#ifdef X86_WIN64
-    case FFI_TYPE_UINT32:
+      flags = X86_RET_SINT16;
+      break;
+    case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-#endif
-    case FFI_TYPE_SINT64:
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-#ifndef X86_WIN64
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-    case FFI_TYPE_LONGDOUBLE:
-#endif
-#endif
-      cif->flags = (unsigned) cif->rtype->type;
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+      flags = X86_RET_INT32;
       break;
-
+    case FFI_TYPE_SINT64:
     case FFI_TYPE_UINT64:
-#ifdef X86_WIN64
-    case FFI_TYPE_POINTER:
-#endif
-      cif->flags = FFI_TYPE_SINT64;
+      flags = X86_RET_INT64;
       break;
-
     case FFI_TYPE_STRUCT:
 #ifndef X86
+      /* ??? This should be a different ABI rather than an ifdef.  */
       if (cif->rtype->size == 1)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_1B; /* same as char size */
-        }
+	flags = X86_RET_STRUCT_1B;
       else if (cif->rtype->size == 2)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_2B; /* same as short size */
-        }
+	flags = X86_RET_STRUCT_2B;
       else if (cif->rtype->size == 4)
-        {
-#ifdef X86_WIN64
-          cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
-#else
-          cif->flags = FFI_TYPE_INT; /* same as int type */
-#endif
-        }
+	flags = X86_RET_INT32;
       else if (cif->rtype->size == 8)
-        {
-          cif->flags = FFI_TYPE_SINT64; /* same as int64 type */
-        }
+	flags = X86_RET_INT64;
       else
 #endif
-        {
-#ifdef X86_WIN32
-          if (cif->abi == FFI_MS_CDECL)
-            cif->flags = FFI_TYPE_MS_STRUCT;
-          else
-#endif
-            cif->flags = FFI_TYPE_STRUCT;
-          /* allocate space for return value pointer */
-          cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
-        }
-      break;
-
-    default:
-#ifdef X86_WIN64
-      cif->flags = FFI_TYPE_SINT64;
+	{
+	do_struct:
+	  switch (cabi)
+	    {
+	    case FFI_THISCALL:
+	    case FFI_FASTCALL:
+	    case FFI_STDCALL:
+	    case FFI_MS_CDECL:
+	      flags = X86_RET_STRUCTARG;
+	      break;
+	    default:
+	      flags = X86_RET_STRUCTPOP;
+	      break;
+	    }
+	  /* Allocate space for return value pointer.  */
+	  bytes += ALIGN (sizeof(void*), FFI_SIZEOF_ARG);
+	}
       break;
-    case FFI_TYPE_INT:
-      cif->flags = FFI_TYPE_SINT32;
-#else
-      cif->flags = FFI_TYPE_INT;
-#endif
+    case FFI_TYPE_COMPLEX:
+      switch (cif->rtype->elements[0]->type)
+	{
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
+	case FFI_TYPE_SINT64:
+	case FFI_TYPE_UINT64:
+	  goto do_struct;
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_INT:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_UINT32:
+	  flags = X86_RET_INT64;
+	  break;
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_UINT16:
+	  flags = X86_RET_INT32;
+	  break;
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT8:
+	  flags = X86_RET_STRUCT_2B;
+	  break;
+	default:
+	  return FFI_BAD_TYPEDEF;
+	}
       break;
+    default:
+      return FFI_BAD_TYPEDEF;
     }
+  cif->flags = flags;
 
-  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++)
+  for (i = 0, n = cif->nargs; i < n; i++)
     {
-      if (((*ptr)->alignment - 1) & cif->bytes)
-        cif->bytes = ALIGN(cif->bytes, (*ptr)->alignment);
-      cif->bytes += ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
-    }
+      ffi_type *t = cif->arg_types[i];
 
-#ifdef X86_WIN64
-  /* ensure space for storing four registers */
-  cif->bytes += 4 * sizeof(ffi_arg);
-#endif
-
-#ifdef X86_DARWIN
-  cif->bytes = (cif->bytes + 15) & ~0xF;
-#endif
+      bytes = ALIGN (bytes, t->alignment);
+      bytes += ALIGN (t->size, FFI_SIZEOF_ARG);
+    }
+  cif->bytes = ALIGN (bytes, 16);
 
   return FFI_OK;
 }
 
-#ifdef X86_WIN64
-extern int
-ffi_call_win64(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned *, void (*fn)(void));
-#elif defined(X86_WIN32)
-extern void
-ffi_call_win32(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
-#else
-extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
-                          unsigned, unsigned, unsigned *, void (*fn)(void));
-#endif
-
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static ffi_arg
+extend_basic_type(void *arg, int type)
 {
-  extended_cif ecif;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
-
-#ifdef X86_WIN64
-  if (rvalue == NULL
-      && cif->flags == FFI_TYPE_STRUCT
-      && cif->rtype->size != 1 && cif->rtype->size != 2
-      && cif->rtype->size != 4 && cif->rtype->size != 8)
+  switch (type)
     {
-      ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
+    case FFI_TYPE_SINT8:
+      return *(SINT8 *)arg;
+    case FFI_TYPE_UINT8:
+      return *(UINT8 *)arg;
+    case FFI_TYPE_SINT16:
+      return *(SINT16 *)arg;
+    case FFI_TYPE_UINT16:
+      return *(UINT16 *)arg;
+
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_FLOAT:
+      return *(UINT32 *)arg;
+
+    default:
+      abort();
     }
-#else
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
+}
+
+struct call_frame
+{
+  void *ebp;		/* 0 */
+  void *retaddr;	/* 4 */
+  void (*fn)(void);	/* 8 */
+  int flags;		/* 12 */
+  void *rvalue;		/* 16 */
+  unsigned regs[3];	/* 20-28 */
+};
+
+struct abi_params
+{
+  int dir;		/* parameter growth direction */
+  int static_chain;	/* the static chain register used by gcc */
+  int nregs;		/* number of register parameters */
+  int regs[3];
+};
+
+static const struct abi_params abi_params[FFI_LAST_ABI] = {
+  [FFI_SYSV] = { 1, R_ECX, 0 },
+  [FFI_THISCALL] = { 1, R_EAX, 1, { R_ECX } },
+  [FFI_FASTCALL] = { 1, R_EAX, 2, { R_ECX, R_EDX } },
+  [FFI_STDCALL] = { 1, R_ECX, 0 },
+  [FFI_PASCAL] = { -1, R_ECX, 0 },
+  /* ??? No defined static chain; gcc does not support REGISTER.  */
+  [FFI_REGISTER] = { -1, R_ECX, 3, { R_EAX, R_EDX, R_ECX } },
+  [FFI_MS_CDECL] = { 1, R_ECX, 0 }
+};
+
+extern void ffi_call_i386(struct call_frame *, char *)
+#if HAVE_FASTCALL
+	__declspec(fastcall)
+#endif
+	FFI_HIDDEN;
+
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
+{
+  size_t rsize, bytes;
+  struct call_frame *frame;
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int flags, cabi, i, n, dir, narg_reg;
+  const struct abi_params *pabi;
+
+  flags = cif->flags;
+  cabi = cif->abi;
+  pabi = &abi_params[cabi];
+  dir = pabi->dir;
+
+  rsize = 0;
+  if (rvalue == NULL)
     {
-      ecif.rvalue = alloca(cif->rtype->size);
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTARG:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can pretend that the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
     }
-#endif
-  else
-    ecif.rvalue = rvalue;
-    
-  
-  switch (cif->abi) 
+
+  bytes = cif->bytes;
+  stack = alloca(bytes + sizeof(*frame) + rsize);
+  argp = (dir < 0 ? stack + bytes : stack);
+  frame = (struct call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
+
+  frame->fn = fn;
+  frame->flags = flags;
+  frame->rvalue = rvalue;
+  frame->regs[pabi->static_chain] = (unsigned)closure;
+
+  narg_reg = 0;
+  switch (flags)
     {
-#ifdef X86_WIN64
-    case FFI_WIN64:
-      ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
-                     cif->flags, ecif.rvalue, fn);
-      break;
-#elif defined(X86_WIN32)
-    case FFI_SYSV:
-    case FFI_STDCALL:
-    case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
-      break;
-    case FFI_THISCALL:
-    case FFI_FASTCALL:
-      {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
-
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
-	  {
-	    size_t sz;
-
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
-	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
-      }
-      break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
-                    fn);
-      break;
-#endif
-    default:
-      FFI_ASSERT(0);
+    case X86_RET_STRUCTARG:
+      /* The pointer is passed as the first argument.  */
+      if (pabi->nregs > 0)
+	{
+	  frame->regs[pabi->regs[0]] = (unsigned)rvalue;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;
+      argp += sizeof(void *);
       break;
     }
-}
-
 
-/** private members **/
+  arg_types = cif->arg_types;
+  for (i = 0, n = cif->nargs; i < n; i++)
+    {
+      ffi_type *ty = arg_types[i];
+      void *valp = avalue[i];
+      size_t z = ty->size;
+      int t = ty->type;
 
-/* The following __attribute__((regparm(1))) decorations will have no effect
-   on MSVC - standard cdecl convention applies. */
-static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
-                                         void** args, ffi_cif* cif);
-void FFI_HIDDEN ffi_closure_SYSV (ffi_closure *)
-     __attribute__ ((regparm(1)));
-unsigned int FFI_HIDDEN ffi_closure_SYSV_inner (ffi_closure *, void **, void *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-#ifdef X86_WIN32
-void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-#endif
-#ifdef X86_WIN64
-void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
-#endif
+      if (z <= FFI_SIZEOF_ARG && t != FFI_TYPE_STRUCT)
+        {
+	  ffi_arg val = extend_basic_type (valp, t);
+
+	  if (t != FFI_TYPE_FLOAT && narg_reg < pabi->nregs)
+	    frame->regs[pabi->regs[narg_reg++]] = val;
+	  else if (dir < 0)
+	    {
+	      argp -= 4;
+	      *(ffi_arg *)argp = val;
+	    }
+	  else
+	    {
+	      *(ffi_arg *)argp = val;
+	      argp += 4;
+	    }
+	}
+      else
+	{
+	  size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+	  size_t align = FFI_SIZEOF_ARG;
+
+	  /* Alignment rules for arguments are quite complex.  Vectors and
+	     structures with 16 byte alignment get it.  Note that long double
+	     on Darwin does have 16 byte alignment, and does not get this
+	     alignment if passed directly; a structure with a long double
+	     inside, however, would get 16 byte alignment.  Since libffi does
+	     not support vectors, we need non concern ourselves with other
+	     cases.  */
+	  if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+	    align = 16;
+	    
+	  if (dir < 0)
+	    {
+	      /* ??? These reverse argument ABIs are probably too old
+		 to have cared about alignment.  Someone should check.  */
+	      argp -= za;
+	      memcpy (argp, valp, z);
+	    }
+	  else
+	    {
+	      argp = (char *)ALIGN (argp, align);
+	      memcpy (argp, valp, z);
+	      argp += za;
+	    }
+	}
+    }
+  FFI_ASSERT (dir > 0 || argp == stack);
 
-/* This function is jumped to by the trampoline */
-
-#ifdef X86_WIN64
-void * FFI_HIDDEN
-ffi_closure_win64_inner (ffi_closure *closure, void *args) {
-  ffi_cif       *cif;
-  void         **arg_area;
-  void          *result;
-  void          *resp = &result;
-
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_SYSV(args, &resp, arg_area, cif);
-  
-  (closure->fun) (cif, resp, arg_area, closure->user_data);
-
-  /* The result is returned in rax.  This does the right thing for
-     result types except for floats; we have to 'mov xmm0, rax' in the
-     caller to correct this.
-     TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
-  */
-  return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
+  ffi_call_i386 (frame, stack);
 }
 
-#else
-unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
-ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
-  /* our various things...  */
-  ffi_cif       *cif;
-  void         **arg_area;
-
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
 
-  ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
 
-  (closure->fun) (cif, *respp, arg_area, closure->user_data);
+/** private members **/
 
-  return cif->flags;
-}
-#endif /* !X86_WIN64 */
+void FFI_HIDDEN ffi_closure_i386(void);
+void FFI_HIDDEN ffi_closure_STDCALL(void);
+void FFI_HIDDEN ffi_closure_REGISTER(void);
 
-static void
-ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
-                            ffi_cif *cif)
+struct closure_frame
 {
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-
-  argp = stack;
-
-#ifdef X86_WIN64
-  if (cif->rtype->size > sizeof(ffi_arg)
-      || (cif->flags == FFI_TYPE_STRUCT
-          && (cif->rtype->size != 1 && cif->rtype->size != 2
-              && cif->rtype->size != 4 && cif->rtype->size != 8))) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#else
-  if ( cif->flags == FFI_TYPE_STRUCT
-       || cif->flags == FFI_TYPE_MS_STRUCT ) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#endif
+  unsigned rettemp[4];				/* 0 */
+  unsigned regs[3];				/* 16-24 */
+  ffi_cif *cif;					/* 28 */
+  void (*fun)(ffi_cif*,void*,void**,void*);	/* 32 */
+  void *user_data;				/* 36 */
+};
+
+int FFI_HIDDEN
+#if HAVE_FASTCALL
+__declspec(fastcall)
+#endif
+ffi_closure_inner (struct closure_frame *frame, char *stack)
+{
+  ffi_cif *cif = frame->cif;
+  int cabi, i, n, flags, dir, narg_reg;
+  const struct abi_params *pabi;
+  ffi_type **arg_types;
+  char *argp;
+  void *rvalue;
+  void **avalue;
+
+  cabi = cif->abi;
+  flags = cif->flags;
+  narg_reg = 0;
+  rvalue = frame->rettemp;
+  pabi = &abi_params[cabi];
+  dir = pabi->dir;
+  argp = (dir < 0 ? stack + cif->bytes : stack);
+
+  switch (flags)
+    {
+    case X86_RET_STRUCTARG:
+      if (pabi->nregs > 0)
+	{
+	  rvalue = (void *)frame->regs[pabi->regs[0]];
+	  narg_reg = 1;
+	  frame->rettemp[0] = (unsigned)rvalue;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      rvalue = *(void **)argp;
+      argp += sizeof(void *);
+      frame->rettemp[0] = (unsigned)rvalue;
+      break;
+    }
 
-  p_argv = avalue;
+  n = cif->nargs;
+  avalue = alloca(sizeof(void *) * n);
 
-  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
+  arg_types = cif->arg_types;
+  for (i = 0; i < n; ++i)
     {
-      size_t z;
-
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp) {
-        argp = (char *) ALIGN(argp, sizeof(void*));
-      }
+      ffi_type *ty = arg_types[i];
+      size_t z = ty->size;
+      int t = ty->type;
+      void *valp;
 
-#ifdef X86_WIN64
-      if ((*p_arg)->size > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && ((*p_arg)->size != 1 && (*p_arg)->size != 2
-                  && (*p_arg)->size != 4 && (*p_arg)->size != 8)))
-        {
-          z = sizeof(void *);
-          *p_argv = *(void **)argp;
-        }
+      if (z <= FFI_SIZEOF_ARG && t != FFI_TYPE_STRUCT)
+	{
+	  if (t != FFI_TYPE_FLOAT && narg_reg < pabi->nregs)
+	    valp = &frame->regs[pabi->regs[narg_reg++]];
+	  else if (dir < 0)
+	    {
+	      argp -= 4;
+	      valp = argp;
+	    }
+	  else
+	    {
+	      valp = argp;
+	      argp += 4;
+	    }
+	}
       else
-#endif
-        {
-          z = (*p_arg)->size;
-          
-          /* because we're little endian, this is what it turns into.   */
-          
-          *p_argv = (void*) argp;
-        }
-          
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
-      argp += z;
-#endif
+	{
+	  size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+	  size_t align = FFI_SIZEOF_ARG;
+
+	  /* See the comment in ffi_call_int.  */
+	  if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+	    align = 16;
+
+	  if (dir < 0)
+	    {
+	      /* ??? These reverse argument ABIs are probably too old
+		 to have cared about alignment.  Someone should check.  */
+	      argp -= za;
+	      valp = argp;
+	    }
+	  else
+	    {
+	      argp = (char *)ALIGN (argp, align);
+	      valp = argp;
+	      argp += za;
+	    }
+	}
+
+      avalue[i] = valp;
     }
-  
-  return;
-}
 
-#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   void*  __fun = (void*)(FUN); \
-   void*  __ctx = (void*)(CTX); \
-   *(unsigned char*) &__tramp[0] = 0x41; \
-   *(unsigned char*) &__tramp[1] = 0xbb; \
-   *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
-   *(unsigned char*) &__tramp[6] = 0x48; \
-   *(unsigned char*) &__tramp[7] = 0xb8; \
-   *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
-   *(unsigned char *)  &__tramp[16] = 0x49; \
-   *(unsigned char *)  &__tramp[17] = 0xba; \
-   *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
-   *(unsigned char *)  &__tramp[26] = 0x41; \
-   *(unsigned char *)  &__tramp[27] = 0xff; \
-   *(unsigned char *)  &__tramp[28] = 0xe2; /* jmp %r10 */ \
- }
-
-/* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
-
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10);  \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe9; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* jmp __fun  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_THISCALL(TRAMP,FUN,CTX,SIZE) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 49);  \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned int *) &__tramp[0] = 0x8324048b;	/* mov (%esp), %eax */ \
-   *(unsigned int *) &__tramp[4] = 0x4c890cec;	/* sub $12, %esp */ \
-   *(unsigned int *) &__tramp[8] = 0x04890424;	/* mov %ecx, 4(%esp) */ \
-   *(unsigned char*) &__tramp[12] = 0x24;	/* mov %eax, (%esp) */ \
-   *(unsigned char*) &__tramp[13] = 0xb8; \
-   *(unsigned int *) &__tramp[14] = __size;	/* mov __size, %eax */ \
-   *(unsigned int *) &__tramp[18] = 0x08244c8d;	/* lea 8(%esp), %ecx */ \
-   *(unsigned int *) &__tramp[22] = 0x4802e8c1; /* shr $2, %eax ; dec %eax */ \
-   *(unsigned short*) &__tramp[26] = 0x0b74;	/* jz 1f */ \
-   *(unsigned int *) &__tramp[28] = 0x8908518b;	/* 2b: mov 8(%ecx), %edx */ \
-   *(unsigned int *) &__tramp[32] = 0x04c18311; /* mov %edx, (%ecx) ; add $4, %ecx */ \
-   *(unsigned char*) &__tramp[36] = 0x48;	/* dec %eax */ \
-   *(unsigned short*) &__tramp[37] = 0xf575;	/* jnz 2b ; 1f: */ \
-   *(unsigned char*) &__tramp[39] = 0xb8; \
-   *(unsigned int*)  &__tramp[40] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[44] = 0xe8; \
-   *(unsigned int*)  &__tramp[45] = __dis; /* call __fun  */ \
-   *(unsigned char*)  &__tramp[49] = 0xc2; /* ret  */ \
-   *(unsigned short*)  &__tramp[50] = (__size + 8); /* ret (__size + 8)  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_STDCALL(TRAMP,FUN,CTX,SIZE)  \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10); \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe8; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* call __fun  */ \
-   *(unsigned char *)  &__tramp[10] = 0xc2; \
-   *(unsigned short*)  &__tramp[11] = __size; /* ret __size  */ \
- }
-
-/* the cif must already be prep'ed */
+  frame->fun (cif, rvalue, avalue, frame->user_data);
+
+  if (cabi == FFI_STDCALL)
+    return flags + (cif->bytes << X86_RET_POP_SHIFT);
+  else
+    return flags;
+}
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -650,54 +503,76 @@ ffi_prep_closure_loc (ffi_closure* closure,
                       void *user_data,
                       void *codeloc)
 {
-#ifdef X86_WIN64
-#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
-#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
-  if (cif->abi == FFI_WIN64) 
-    {
-      int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
-      FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
-                                 &ffi_closure_win64,
-                                 codeloc, mask);
-      /* make sure we can execute here */
-    }
-#else
-  if (cif->abi == FFI_SYSV)
-    {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
-    }
-#ifdef X86_WIN32
-  else if (cif->abi == FFI_THISCALL)
-    {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0],
-				    &ffi_closure_THISCALL,
-				    (void*)codeloc,
-				    cif->bytes);
-    }
-  else if (cif->abi == FFI_STDCALL)
-    {
-      FFI_INIT_TRAMPOLINE_STDCALL (&closure->tramp[0],
-                                   &ffi_closure_STDCALL,
-                                   (void*)codeloc, cif->bytes);
-    }
-  else if (cif->abi == FFI_MS_CDECL)
+  char *tramp = closure->tramp;
+  void (*dest)(void);
+  int op = 0xb8;  /* movl imm, %eax */
+
+  switch (cif->abi)
     {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
+    case FFI_SYSV:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+      dest = ffi_closure_i386;
+      break;
+    case FFI_STDCALL:
+    case FFI_PASCAL:
+      dest = ffi_closure_STDCALL;
+      break;
+    case FFI_REGISTER:
+      dest = ffi_closure_REGISTER;
+      op = 0x68;  /* pushl imm */
+    default:
+      return FFI_BAD_ABI;
     }
-#endif /* X86_WIN32 */
-#endif /* !X86_WIN64 */
-  else
+
+  /* movl or pushl immediate.  */
+  tramp[0] = op;
+  *(void **)(tramp + 1) = codeloc;
+
+  /* jmp dest */
+  tramp[5] = 0xe9;
+  *(unsigned *)(tramp + 6) = (unsigned)dest - ((unsigned)codeloc + 10);
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+void FFI_HIDDEN ffi_go_closure_EAX(void);
+void FFI_HIDDEN ffi_go_closure_ECX(void);
+void FFI_HIDDEN ffi_go_closure_STDCALL(void);
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*,void*,void**,void*))
+{
+  void (*dest)(void);
+
+  switch (cif->abi)
     {
+    case FFI_SYSV:
+    case FFI_MS_CDECL:
+      dest = ffi_go_closure_ECX;
+      break;
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+      dest = ffi_go_closure_EAX;
+      break;
+    case FFI_STDCALL:
+    case FFI_PASCAL:
+      dest = ffi_go_closure_STDCALL;
+      break;
+    case FFI_REGISTER:
+    default:
       return FFI_BAD_ABI;
     }
-    
-  closure->cif  = cif;
-  closure->user_data = user_data;
-  closure->fun  = fun;
+
+  closure->tramp = dest;
+  closure->cif = cif;
+  closure->fun = fun;
 
   return FFI_OK;
 }
@@ -706,138 +581,145 @@ ffi_prep_closure_loc (ffi_closure* closure,
 
 #if !FFI_NO_RAW_API
 
+void FFI_HIDDEN ffi_closure_raw_SYSV(void);
+void FFI_HIDDEN ffi_closure_raw_THISCALL(void);
+
 ffi_status
-ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
-                          ffi_cif* cif,
+ffi_prep_raw_closure_loc (ffi_raw_closure *closure,
+                          ffi_cif *cif,
                           void (*fun)(ffi_cif*,void*,ffi_raw*,void*),
                           void *user_data,
                           void *codeloc)
 {
+  char *tramp = closure->tramp;
+  void (*dest)(void);
   int i;
 
-  if (cif->abi != FFI_SYSV) {
-#ifdef X86_WIN32
-    if (cif->abi != FFI_THISCALL)
-#endif
-    return FFI_BAD_ABI;
-  }
-
-  /* we currently don't support certain kinds of arguments for raw
+  /* We currently don't support certain kinds of arguments for raw
      closures.  This should be implemented by a separate assembly
      language routine, since it would require argument processing,
      something we don't do now for performance.  */
-
   for (i = cif->nargs-1; i >= 0; i--)
+    switch (cif->arg_types[i]->type)
+      {
+      case FFI_TYPE_STRUCT:
+      case FFI_TYPE_LONGDOUBLE:
+	return FFI_BAD_TYPEDEF;
+      }
+
+  switch (cif->abi)
     {
-      FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_STRUCT);
-      FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_LONGDOUBLE);
-    }
-  
-#ifdef X86_WIN32
-  if (cif->abi == FFI_SYSV)
-    {
-#endif
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_raw_SYSV,
-                       codeloc);
-#ifdef X86_WIN32
-    }
-  else if (cif->abi == FFI_THISCALL)
-    {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0], &ffi_closure_raw_THISCALL,
-				    codeloc, cif->bytes);
+    case FFI_THISCALL:
+      dest = ffi_closure_raw_THISCALL;
+      break;
+    case FFI_SYSV:
+      dest = ffi_closure_raw_SYSV;
+      break;
+    default:
+      return FFI_BAD_ABI;
     }
-#endif
-  closure->cif  = cif;
+
+  /* movl imm, %eax.  */
+  tramp[0] = 0xb8;
+  *(void **)(tramp + 1) = codeloc;
+
+  /* jmp dest */
+  tramp[5] = 0xe9;
+  *(unsigned *)(tramp + 6) = (unsigned)dest - ((unsigned)codeloc + 10);
+
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
 
   return FFI_OK;
 }
 
-static void 
-ffi_prep_args_raw(char *stack, extended_cif *ecif)
-{
-  memcpy (stack, ecif->avalue, ecif->cif->bytes);
-}
-
-/* we borrow this routine from libffi (it must be changed, though, to
- * actually call the function passed in the first argument.  as of
- * libffi-1.20, this is not the case.)
- */
-
 void
-ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
+ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
 {
-  extended_cif ecif;
-  void **avalue = (void **)fake_avalue;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
-
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
+  size_t rsize, bytes;
+  struct call_frame *frame;
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int flags, cabi, i, n, narg_reg;
+  const struct abi_params *pabi;
+
+  flags = cif->flags;
+  cabi = cif->abi;
+  pabi = &abi_params[cabi];
+
+  rsize = 0;
+  if (rvalue == NULL)
     {
-      ecif.rvalue = alloca(cif->rtype->size);
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTARG:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can pretend that the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
     }
-  else
-    ecif.rvalue = rvalue;
-    
-  
-  switch (cif->abi) 
+
+  bytes = cif->bytes;
+  argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+  frame = (struct call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
+
+  narg_reg = 0;
+  switch (flags)
     {
-#ifdef X86_WIN32
-    case FFI_SYSV:
-    case FFI_STDCALL:
-    case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
-      break;
-    case FFI_THISCALL:
-    case FFI_FASTCALL:
-      {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
-
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
-	  {
-	    size_t sz;
-
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
-	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  cif->abi = abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  cif->abi = abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args_raw, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
-      }
-      break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args_raw, &ecif, cif->bytes, cif->flags,
-                    ecif.rvalue, fn);
-      break;
-#endif
-    default:
-      FFI_ASSERT(0);
+    case X86_RET_STRUCTARG:
+      /* The pointer is passed as the first argument.  */
+      if (pabi->nregs > 0)
+	{
+	  frame->regs[pabi->regs[0]] = (unsigned)rvalue;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;
+      argp += sizeof(void *);
+      bytes -= sizeof(void *);
       break;
     }
-}
 
-#endif
+  arg_types = cif->arg_types;
+  for (i = 0, n = cif->nargs; narg_reg < pabi->nregs && i < n; i++)
+    {
+      ffi_type *ty = arg_types[i];
+      size_t z = ty->size;
+      int t = ty->type;
 
-#endif /* !__x86_64__  || X86_WIN64 */
+      if (z <= FFI_SIZEOF_ARG && t != FFI_TYPE_STRUCT && t != FFI_TYPE_FLOAT)
+	{
+	  ffi_arg val = extend_basic_type (avalue, t);
+	  frame->regs[pabi->regs[narg_reg++]] = val;
+	  z = FFI_SIZEOF_ARG;
+	}
+      else
+	{
+	  memcpy (argp, avalue, z);
+	  z = ALIGN (z, FFI_SIZEOF_ARG);
+	  argp += z;
+	}
+      avalue += z;
+      bytes -= z;
+    }
+  if (i < n)
+    memcpy (argp, avalue, bytes);
 
+  ffi_call_i386 (frame, stack);
+}
+#endif /* !FFI_NO_RAW_API */
+#endif /* !__x86_64__ */
diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index 1daa1c0..131b5e3 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -1,9 +1,10 @@
 /* -----------------------------------------------------------------------
-   ffi64.c - Copyright (c) 20011  Anthony Green
+   ffi64.c - Copyright (c) 2013  The Written Word, Inc.
+             Copyright (c) 2011  Anthony Green
              Copyright (c) 2008, 2010  Red Hat, Inc.
              Copyright (c) 2002, 2007  Bo Thorsen <bo@suse.de>
-             
-   x86-64 Foreign Function Interface 
+
+   x86-64 Foreign Function Interface
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -31,27 +32,44 @@
 
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdint.h>
+#include "internal64.h"
 
 #ifdef __x86_64__
 
 #define MAX_GPR_REGS 6
 #define MAX_SSE_REGS 8
 
-#ifdef __INTEL_COMPILER
+#if defined(__INTEL_COMPILER)
+#include "xmmintrin.h"
 #define UINT128 __m128
 #else
+#if defined(__SUNPRO_C)
+#include <sunmedia_types.h>
+#define UINT128 __m128i
+#else
 #define UINT128 __int128_t
 #endif
+#endif
+
+union big_int_union
+{
+  UINT32 i32;
+  UINT64 i64;
+  UINT128 i128;
+};
 
 struct register_args
 {
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
-  UINT128 sse[MAX_SSE_REGS];
+  union big_int_union sse[MAX_SSE_REGS];
+  UINT64 rax;	/* ssecount */
+  UINT64 r10;	/* static chain */
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)(void), unsigned ssecount);
+			     void *raddr, void (*fnaddr)(void)) FFI_HIDDEN;
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -138,7 +156,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
 
    See the x86-64 PS ABI for details.
 */
-static int
+static size_t
 classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 		   size_t byte_offset)
 {
@@ -153,8 +171,9 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_UINT64:
     case FFI_TYPE_SINT64:
     case FFI_TYPE_POINTER:
+    do_integer:
       {
-	int size = byte_offset + type->size;
+	size_t size = byte_offset + type->size;
 
 	if (size <= 4)
 	  {
@@ -174,7 +193,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	  }
 	else if (size <= 16)
 	  {
-	    classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
+	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
 	    return 2;
 	  }
 	else
@@ -189,15 +208,17 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_DOUBLE:
       classes[0] = X86_64_SSEDF_CLASS;
       return 1;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
     case FFI_TYPE_LONGDOUBLE:
       classes[0] = X86_64_X87_CLASS;
       classes[1] = X86_64_X87UP_CLASS;
       return 2;
+#endif
     case FFI_TYPE_STRUCT:
       {
-	const int UNITS_PER_WORD = 8;
-	int words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
-	ffi_type **ptr; 
+	const size_t UNITS_PER_WORD = 8;
+	size_t words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+	ffi_type **ptr;
 	int i;
 	enum x86_64_reg_class subclasses[MAX_CLASSES];
 
@@ -212,6 +233,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	   signalize memory class, so handle it as special case.  */
 	if (!words)
 	  {
+    case FFI_TYPE_VOID:
 	    classes[0] = X86_64_NO_CLASS;
 	    return 1;
 	  }
@@ -219,7 +241,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	/* Merge the fields of structure.  */
 	for (ptr = type->elements; *ptr != NULL; ptr++)
 	  {
-	    int num;
+	    size_t num;
 
 	    byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
 
@@ -228,7 +250,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	      return 0;
 	    for (i = 0; i < num; i++)
 	      {
-		int pos = byte_offset / 8;
+		size_t pos = byte_offset / 8;
 		classes[i + pos] =
 		  merge_classes (subclasses[i], classes[i + pos]);
 	      }
@@ -281,22 +303,54 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	  }
 	return words;
       }
-
-    default:
-      FFI_ASSERT(0);
+    case FFI_TYPE_COMPLEX:
+      {
+	ffi_type *inner = type->elements[0];
+	switch (inner->type)
+	  {
+	  case FFI_TYPE_INT:
+	  case FFI_TYPE_UINT8:
+	  case FFI_TYPE_SINT8:
+	  case FFI_TYPE_UINT16:
+	  case FFI_TYPE_SINT16:
+	  case FFI_TYPE_UINT32:
+	  case FFI_TYPE_SINT32:
+	  case FFI_TYPE_UINT64:
+	  case FFI_TYPE_SINT64:
+	    goto do_integer;
+
+	  case FFI_TYPE_FLOAT:
+	    classes[0] = X86_64_SSE_CLASS;
+	    if (byte_offset % 8)
+	      {
+		classes[1] = X86_64_SSESF_CLASS;
+		return 2;
+	      }
+	    return 1;
+	  case FFI_TYPE_DOUBLE:
+	    classes[0] = classes[1] = X86_64_SSEDF_CLASS;
+	    return 2;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+	  case FFI_TYPE_LONGDOUBLE:
+	    classes[0] = X86_64_COMPLEX_X87_CLASS;
+	    return 1;
+#endif
+	  }
+      }
     }
-  return 0; /* Never reached.  */
+  abort();
 }
 
 /* Examine the argument and return set number of register required in each
    class.  Return zero iff parameter should be passed in memory, otherwise
    the number of registers.  */
 
-static int
+static size_t
 examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
 		  _Bool in_return, int *pngpr, int *pnsse)
 {
-  int i, n, ngpr, nsse;
+  size_t n;
+  int i, ngpr, nsse;
 
   n = classify_argument (type, classes, 0);
   if (n == 0)
@@ -337,15 +391,59 @@ examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
+  int gprcount, ssecount, i, avn, ngpr, nsse, flags;
   enum x86_64_reg_class classes[MAX_CLASSES];
-  size_t bytes;
+  size_t bytes, n, rtype_size;
+  ffi_type *rtype;
+
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
 
   gprcount = ssecount = 0;
 
-  flags = cif->rtype->type;
-  if (flags != FFI_TYPE_VOID)
+  rtype = cif->rtype;
+  rtype_size = rtype->size;
+  switch (rtype->type)
     {
+    case FFI_TYPE_VOID:
+      flags = UNIX64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = UNIX64_RET_UINT8;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = UNIX64_RET_SINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = UNIX64_RET_UINT16;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = UNIX64_RET_SINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = UNIX64_RET_UINT32;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = UNIX64_RET_SINT32;
+      break;
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      flags = UNIX64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? UNIX64_RET_UINT32 : UNIX64_RET_INT64);
+      break;
+    case FFI_TYPE_FLOAT:
+      flags = UNIX64_RET_XMM32;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = UNIX64_RET_XMM64;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = UNIX64_RET_X87;
+      break;
+    case FFI_TYPE_STRUCT:
       n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
       if (n == 0)
 	{
@@ -353,22 +451,62 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	     memory is the first argument.  Allocate a register for it.  */
 	  gprcount++;
 	  /* We don't have to do anything in asm for the return.  */
-	  flags = FFI_TYPE_VOID;
+	  flags = UNIX64_RET_VOID | UNIX64_FLAG_RET_IN_MEM;
 	}
-      else if (flags == FFI_TYPE_STRUCT)
+      else
 	{
-	  /* Mark which registers the result appears in.  */
 	  _Bool sse0 = SSE_CLASS_P (classes[0]);
-	  _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
-	  if (sse0 && !sse1)
-	    flags |= 1 << 8;
-	  else if (!sse0 && sse1)
-	    flags |= 1 << 9;
-	  else if (sse0 && sse1)
-	    flags |= 1 << 10;
-	  /* Mark the true size of the structure.  */
-	  flags |= cif->rtype->size << 12;
+
+	  if (rtype_size == 4 && sse0)
+	    flags = UNIX64_RET_XMM32;
+	  else if (rtype_size == 8)
+	    flags = sse0 ? UNIX64_RET_XMM64 : UNIX64_RET_INT64;
+	  else
+	    {
+	      _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
+	      if (sse0 && sse1)
+		flags = UNIX64_RET_ST_XMM0_XMM1;
+	      else if (sse0)
+		flags = UNIX64_RET_ST_XMM0_RAX;
+	      else if (sse1)
+		flags = UNIX64_RET_ST_RAX_XMM0;
+	      else
+		flags = UNIX64_RET_ST_RAX_RDX;
+	      flags |= rtype_size << UNIX64_SIZE_SHIFT;
+	    }
+	}
+      break;
+    case FFI_TYPE_COMPLEX:
+      switch (rtype->elements[0]->type)
+	{
+	case FFI_TYPE_UINT8:
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT16:
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_INT:
+	case FFI_TYPE_UINT32:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+	  flags = UNIX64_RET_ST_RAX_RDX | (rtype_size << UNIX64_SIZE_SHIFT);
+	  break;
+	case FFI_TYPE_FLOAT:
+	  flags = UNIX64_RET_XMM64;
+	  break;
+	case FFI_TYPE_DOUBLE:
+	  flags = UNIX64_RET_ST_XMM0_XMM1 | (16 << UNIX64_SIZE_SHIFT);
+	  break;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+	case FFI_TYPE_LONGDOUBLE:
+	  flags = UNIX64_RET_X87_2;
+	  break;
+#endif
+	default:
+	  return FFI_BAD_TYPEDEF;
 	}
+      break;
+    default:
+      return FFI_BAD_TYPEDEF;
     }
 
   /* Go over all arguments and determine the way they should be passed.
@@ -395,44 +533,50 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	}
     }
   if (ssecount)
-    flags |= 1 << 11;
+    flags |= UNIX64_FLAG_XMM_ARGS;
+
   cif->flags = flags;
   cif->bytes = ALIGN (bytes, 8);
 
   return FFI_OK;
 }
 
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
 {
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
   ffi_type **arg_types;
-  int gprcount, ssecount, ngpr, nsse, i, avn;
-  _Bool ret_in_memory;
+  int gprcount, ssecount, ngpr, nsse, i, avn, flags;
   struct register_args *reg_args;
 
   /* Can't call 32-bit mode from 64-bit mode.  */
   FFI_ASSERT (cif->abi == FFI_UNIX64);
 
   /* If the return value is a struct and we don't have a return value
-     address then we need to make one.  Note the setting of flags to
-     VOID above in ffi_prep_cif_machdep.  */
-  ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
-		   && (cif->flags & 0xff) == FFI_TYPE_VOID);
-  if (rvalue == NULL && ret_in_memory)
-    rvalue = alloca (cif->rtype->size);
+     address then we need to make one.  Otherwise we can ignore it.  */
+  flags = cif->flags;
+  if (rvalue == NULL)
+    {
+      if (flags & UNIX64_FLAG_RET_IN_MEM)
+	rvalue = alloca (cif->rtype->size);
+      else
+	flags = UNIX64_RET_VOID;
+    }
 
   /* Allocate the space for the arguments, plus 4 words of temp space.  */
   stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
   reg_args = (struct register_args *) stack;
   argp = stack + sizeof (struct register_args);
 
+  reg_args->r10 = (uintptr_t) closure;
+
   gprcount = ssecount = 0;
 
   /* If the return value is passed in memory, add the pointer as the
      first integer argument.  */
-  if (ret_in_memory)
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
     reg_args->gpr[gprcount++] = (unsigned long) rvalue;
 
   avn = cif->nargs;
@@ -440,8 +584,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 
   for (i = 0; i < avn; ++i)
     {
-      size_t size = arg_types[i]->size;
-      int n;
+      size_t n, size = arg_types[i]->size;
 
       n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
       if (n == 0
@@ -469,18 +612,38 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	    {
 	      switch (classes[j])
 		{
+		case X86_64_NO_CLASS:
+		case X86_64_SSEUP_CLASS:
+		  break;
 		case X86_64_INTEGER_CLASS:
 		case X86_64_INTEGERSI_CLASS:
-		  reg_args->gpr[gprcount] = 0;
-		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+		  /* Sign-extend integer arguments passed in general
+		     purpose registers, to cope with the fact that
+		     LLVM incorrectly assumes that this will be done
+		     (the x86-64 PS ABI does not specify this). */
+		  switch (arg_types[i]->type)
+		    {
+		    case FFI_TYPE_SINT8:
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
+		      break;
+		    case FFI_TYPE_SINT16:
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
+		      break;
+		    case FFI_TYPE_SINT32:
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
+		      break;
+		    default:
+		      reg_args->gpr[gprcount] = 0;
+		      memcpy (&reg_args->gpr[gprcount], a, size);
+		    }
 		  gprcount++;
 		  break;
 		case X86_64_SSE_CLASS:
 		case X86_64_SSEDF_CLASS:
-		  reg_args->sse[ssecount++] = *(UINT64 *) a;
+		  reg_args->sse[ssecount++].i64 = *(UINT64 *) a;
 		  break;
 		case X86_64_SSESF_CLASS:
-		  reg_args->sse[ssecount++] = *(UINT32 *) a;
+		  reg_args->sse[ssecount++].i32 = *(UINT32 *) a;
 		  break;
 		default:
 		  abort();
@@ -488,13 +651,27 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	    }
 	}
     }
+  reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn, ssecount);
+		   flags, rvalue, fn);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
 
-extern void ffi_closure_unix64(void);
+extern void ffi_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_closure_unix64_sse(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -503,29 +680,27 @@ ffi_prep_closure_loc (ffi_closure* closure,
 		      void *user_data,
 		      void *codeloc)
 {
-  volatile unsigned short *tramp;
-
-  /* Sanity check on the cif ABI.  */
-  {
-    int abi = cif->abi;
-    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
-      return FFI_BAD_ABI;
-  }
-
-  tramp = (volatile unsigned short *) &closure->tramp[0];
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  void (*dest)(void);
+  char *tramp = closure->tramp;
 
-  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  *((unsigned long long * volatile) &tramp[1])
-    = (unsigned long) ffi_closure_unix64;
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  *((unsigned long long * volatile) &tramp[6])
-    = (unsigned long) codeloc;
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
 
-  /* Set the carry bit iff the function uses any sse registers.
-     This is clc or stc, together with the first byte of the jmp.  */
-  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+  if (cif->flags & UNIX64_FLAG_XMM_ARGS)
+    dest = ffi_closure_unix64_sse;
+  else
+    dest = ffi_closure_unix64;
 
-  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+  memcpy (tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(tramp + 16) = (uintptr_t)dest;
 
   closure->cif = cif;
   closure->fun = fun;
@@ -534,53 +709,40 @@ ffi_prep_closure_loc (ffi_closure* closure,
   return FFI_OK;
 }
 
-int
-ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
-			 struct register_args *reg_args, char *argp)
+int FFI_HIDDEN
+ffi_closure_unix64_inner(ffi_cif *cif,
+			 void (*fun)(ffi_cif*, void*, void**, void*),
+			 void *user_data,
+			 void *rvalue,
+			 struct register_args *reg_args,
+			 char *argp)
 {
-  ffi_cif *cif;
   void **avalue;
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
-  int ret;
+  int flags;
 
-  cif = closure->cif;
-  avalue = alloca(cif->nargs * sizeof(void *));
+  avn = cif->nargs;
+  flags = cif->flags;
+  avalue = alloca(avn * sizeof(void *));
   gprcount = ssecount = 0;
 
-  ret = cif->rtype->type;
-  if (ret != FFI_TYPE_VOID)
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
     {
-      enum x86_64_reg_class classes[MAX_CLASSES];
-      int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
-      if (n == 0)
-	{
-	  /* The return value goes in memory.  Arrange for the closure
-	     return value to go directly back to the original caller.  */
-	  rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
-	  /* We don't have to do anything in asm for the return.  */
-	  ret = FFI_TYPE_VOID;
-	}
-      else if (ret == FFI_TYPE_STRUCT && n == 2)
-	{
-	  /* Mark which register the second word of the structure goes in.  */
-	  _Bool sse0 = SSE_CLASS_P (classes[0]);
-	  _Bool sse1 = SSE_CLASS_P (classes[1]);
-	  if (!sse0 && sse1)
-	    ret |= 1 << 8;
-	  else if (sse0 && !sse1)
-	    ret |= 1 << 9;
-	}
+      /* On return, %rax will contain the address that was passed
+	 by the caller in %rdi.  */
+      void *r = (void *)(uintptr_t)reg_args->gpr[gprcount++];
+      *(void **)rvalue = r;
+      rvalue = r;
+      flags = (sizeof(void *) == 4 ? UNIX64_RET_UINT32 : UNIX64_RET_INT64);
     }
 
-  avn = cif->nargs;
   arg_types = cif->arg_types;
-  
   for (i = 0; i < avn; ++i)
     {
       enum x86_64_reg_class classes[MAX_CLASSES];
-      int n;
+      size_t n;
 
       n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
       if (n == 0
@@ -634,10 +796,29 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
     }
 
   /* Invoke the closure.  */
-  closure->fun (cif, rvalue, avalue, closure->user_data);
+  fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
-  return ret;
+  return flags;
+}
+
+extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_go_closure_unix64_sse(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = (cif->flags & UNIX64_FLAG_XMM_ARGS
+		    ? ffi_go_closure_unix64_sse
+		    : ffi_go_closure_unix64);
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
 }
 
 #endif /* __x86_64__ */
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index 46f294c..8c1dcac 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -1,5 +1,5 @@
 /* -----------------------------------------------------------------*-C-*-
-   ffitarget.h - Copyright (c) 2012  Anthony Green
+   ffitarget.h - Copyright (c) 2012, 2014  Anthony Green
                  Copyright (c) 1996-2003, 2010  Red Hat, Inc.
                  Copyright (C) 2008  Free Software Foundation, Inc.
 
@@ -49,6 +49,11 @@
 #define USE_BUILTIN_FFS 0 /* not yet implemented in mingw-64 */
 #endif
 
+#define FFI_TARGET_SPECIFIC_STACK_SPACE_ALLOCATION
+#ifndef _MSC_VER
+#define FFI_TARGET_HAS_COMPLEX_TYPE
+#endif
+
 /* ---- Generic type definitions ----------------------------------------- */
 
 #ifndef LIBFFI_ASM
@@ -73,37 +78,40 @@ typedef signed long            ffi_sarg;
 #endif
 
 typedef enum ffi_abi {
+#if defined(X86_WIN64)
   FFI_FIRST_ABI = 0,
-
-  /* ---- Intel x86 Win32 ---------- */
-#ifdef X86_WIN32
-  FFI_SYSV,
-  FFI_STDCALL,
-  FFI_THISCALL,
-  FFI_FASTCALL,
-  FFI_MS_CDECL,
-  FFI_LAST_ABI,
-#ifdef _MSC_VER
-  FFI_DEFAULT_ABI = FFI_MS_CDECL
-#else
-  FFI_DEFAULT_ABI = FFI_SYSV
-#endif
-
-#elif defined(X86_WIN64)
   FFI_WIN64,
   FFI_LAST_ABI,
   FFI_DEFAULT_ABI = FFI_WIN64
 
+#elif defined(X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
+  FFI_FIRST_ABI = 1,
+  FFI_UNIX64,
+  FFI_LAST_ABI,
+  FFI_DEFAULT_ABI = FFI_UNIX64
+
+#elif defined(X86_WIN32)
+  FFI_FIRST_ABI = 0,
+  FFI_SYSV      = 1,
+  FFI_STDCALL   = 2,
+  FFI_THISCALL  = 3,
+  FFI_FASTCALL  = 4,
+  FFI_MS_CDECL  = 5,
+  FFI_PASCAL    = 6,
+  FFI_REGISTER  = 7,
+  FFI_LAST_ABI,
+  FFI_DEFAULT_ABI = FFI_MS_CDECL
 #else
-  /* ---- Intel x86 and AMD x86-64 - */
-  FFI_SYSV,
-  FFI_UNIX64,   /* Unix variants all use the same ABI for x86-64  */
+  FFI_FIRST_ABI = 0,
+  FFI_SYSV      = 1,
+  FFI_THISCALL  = 3,
+  FFI_FASTCALL  = 4,
+  FFI_STDCALL   = 5,
+  FFI_PASCAL    = 6,
+  FFI_REGISTER  = 7,
+  FFI_MS_CDECL  = 8,
   FFI_LAST_ABI,
-#if defined(__i386__) || defined(__i386)
   FFI_DEFAULT_ABI = FFI_SYSV
-#else
-  FFI_DEFAULT_ABI = FFI_UNIX64
-#endif
 #endif
 } ffi_abi;
 #endif
@@ -111,29 +119,20 @@ typedef enum ffi_abi {
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
+#define FFI_GO_CLOSURES 1
+
 #define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
 #define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
 #define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
 
-#if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
-#define FFI_TRAMPOLINE_SIZE 24
-#define FFI_NATIVE_RAW_API 0
+#if defined (X86_64) || defined(X86_WIN64) \
+    || (defined (__x86_64__) && defined (X86_DARWIN))
+# define FFI_TRAMPOLINE_SIZE 24
+# define FFI_NATIVE_RAW_API 0
 #else
-#ifdef X86_WIN32
-#define FFI_TRAMPOLINE_SIZE 52
-#else
-#ifdef X86_WIN64
-#define FFI_TRAMPOLINE_SIZE 29
-#define FFI_NATIVE_RAW_API 0
-#define FFI_NO_RAW_API 1
-#else
-#define FFI_TRAMPOLINE_SIZE 10
-#endif
-#endif
-#ifndef X86_WIN64
-#define FFI_NATIVE_RAW_API 1	/* x86 has native raw api support */
-#endif
+# define FFI_TRAMPOLINE_SIZE 12
+# define FFI_NATIVE_RAW_API 1  /* x86 has native raw api support */
 #endif
 
 #endif
diff --git a/libffi/src/x86/ffiw64.c b/libffi/src/x86/ffiw64.c
new file mode 100644
index 0000000..8a33a6c
--- /dev/null
+++ b/libffi/src/x86/ffiw64.c
@@ -0,0 +1,281 @@
+/* -----------------------------------------------------------------------
+   ffiw64.c - Copyright (c) 2014 Red Hat, Inc.
+
+   x86 win64 Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#ifdef X86_WIN64
+
+struct win64_call_frame
+{
+  UINT64 rbp;		/* 0 */
+  UINT64 retaddr;	/* 8 */
+  UINT64 fn;		/* 16 */
+  UINT64 flags;		/* 24 */
+  UINT64 rvalue;	/* 32 */
+};
+
+extern void ffi_call_win64 (void *stack, struct win64_call_frame *,
+			    void *closure) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  int flags, n;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  flags = cif->rtype->type;
+  switch (flags)
+    {
+    default:
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = FFI_TYPE_STRUCT;
+      break;
+    case FFI_TYPE_COMPLEX:
+      flags = FFI_TYPE_STRUCT;
+      /* FALLTHRU */
+    case FFI_TYPE_STRUCT:
+      switch (cif->rtype->size)
+	{
+	case 8:
+	  flags = FFI_TYPE_UINT64;
+	  break;
+	case 4:
+	  flags = FFI_TYPE_SMALL_STRUCT_4B;
+	  break;
+	case 2:
+	  flags = FFI_TYPE_SMALL_STRUCT_2B;
+	  break;
+	case 1:
+	  flags = FFI_TYPE_SMALL_STRUCT_1B;
+	  break;
+	}
+      break;
+    }
+  cif->flags = flags;
+
+  /* Each argument either fits in a register, an 8 byte slot, or is
+     passed by reference with the pointer in the 8 byte slot.  */
+  n = cif->nargs;
+  n += (flags == FFI_TYPE_STRUCT);
+  if (n < 4)
+    n = 4;
+  cif->bytes = n * 8;
+
+  return FFI_OK;
+}
+
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
+{
+  int i, j, n, flags;
+  UINT64 *stack;
+  size_t rsize;
+  struct win64_call_frame *frame;
+
+  FFI_ASSERT(cif->abi == FFI_WIN64);
+
+  flags = cif->flags;
+  rsize = 0;
+
+  /* If we have no return value for a structure, we need to create one.
+     Otherwise we can ignore the return type entirely.  */
+  if (rvalue == NULL)
+    {
+      if (flags == FFI_TYPE_STRUCT)
+	rsize = cif->rtype->size;
+      else
+	flags = FFI_TYPE_VOID;
+    }
+
+  stack = alloca(cif->bytes + sizeof(struct win64_call_frame) + rsize);
+  frame = (struct win64_call_frame *)((char *)stack + cif->bytes);
+  if (rsize)
+    rvalue = frame + 1;
+
+  frame->fn = (uintptr_t)fn;
+  frame->flags = flags;
+  frame->rvalue = (uintptr_t)rvalue;
+
+  j = 0;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      stack[0] = (uintptr_t)rvalue;
+      j = 1;
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++j)
+    {
+      switch (cif->arg_types[i]->size)
+	{
+	case 8:
+	  stack[j] = *(UINT64 *)avalue[i];
+	  break;
+	case 4:
+	  stack[j] = *(UINT32 *)avalue[i];
+	  break;
+	case 2:
+	  stack[j] = *(UINT16 *)avalue[i];
+	  break;
+	case 1:
+	  stack[j] = *(UINT8 *)avalue[i];
+	  break;
+	default:
+	  stack[j] = (uintptr_t)avalue[i];
+	  break;
+	}
+    }
+
+  ffi_call_win64 (stack, frame, closure);
+}
+
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+
+
+extern void ffi_closure_win64(void) FFI_HIDDEN;
+extern void ffi_go_closure_win64(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+		      ffi_cif* cif,
+		      void (*fun)(ffi_cif*, void*, void**, void*),
+		      void *user_data,
+		      void *codeloc)
+{
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  unsigned char *tramp = closure->tramp;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  memcpy (tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(tramp + 16) = (uintptr_t)ffi_closure_win64;
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = ffi_go_closure_win64;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
+struct win64_closure_frame
+{
+  UINT64 rvalue[2];
+  UINT64 fargs[4];
+  UINT64 retaddr;
+  UINT64 args[];
+};
+
+int FFI_HIDDEN
+ffi_closure_win64_inner(ffi_cif *cif,
+			void (*fun)(ffi_cif*, void*, void**, void*),
+			void *user_data,
+			struct win64_closure_frame *frame)
+{
+  void **avalue;
+  void *rvalue;
+  int i, n, nreg, flags;
+
+  avalue = alloca(cif->nargs * sizeof(void *));
+  rvalue = frame->rvalue;
+  nreg = 0;
+
+  /* When returning a structure, the address is in the first argument.
+     We must also be prepared to return the same address in eax, so
+     install that address in the frame and pretend we return a pointer.  */
+  flags = cif->flags;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      rvalue = (void *)(uintptr_t)frame->args[0];
+      frame->rvalue[0] = frame->args[0];
+      nreg = 1;
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++nreg)
+    {
+      size_t size = cif->arg_types[i]->size;
+      size_t type = cif->arg_types[i]->type;
+      void *a;
+
+      if (type == FFI_TYPE_DOUBLE || type == FFI_TYPE_FLOAT)
+	{
+	  if (nreg < 4)
+	    a = &frame->fargs[nreg];
+	  else
+	    a = &frame->args[nreg];
+	}
+      else if (size == 1 || size == 2 || size == 4 || size == 8)
+	a = &frame->args[nreg];
+      else
+	a = (void *)(uintptr_t)frame->args[nreg];
+
+      avalue[i] = a;
+    }
+
+  /* Invoke the closure.  */
+  fun (cif, rvalue, avalue, user_data);
+  return flags;
+}
+
+#endif /* X86_WIN64 */
diff --git a/libffi/src/x86/freebsd.S b/libffi/src/x86/freebsd.S
deleted file mode 100644
index afde513..0000000
--- a/libffi/src/x86/freebsd.S
+++ /dev/null
@@ -1,458 +0,0 @@
-/* -----------------------------------------------------------------------
-   freebsd.S - Copyright (c) 1996, 1998, 2001, 2002, 2003, 2005  Red Hat, Inc.
-	       Copyright (c) 2008  Björn König
-	
-   X86 Foreign Function Interface for FreeBSD
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   ``Software''), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be included
-   in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-   DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------ */
-
-#ifndef __x86_64__
-
-#define LIBFFI_ASM	
-#include <fficonfig.h>
-#include <ffi.h>
-
-.text
-
-.globl ffi_prep_args
-
-	.align 4
-.globl ffi_call_SYSV
-        .type    ffi_call_SYSV,@function
-
-ffi_call_SYSV:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-	/* Make room for all of the new args.  */
-	movl  16(%ebp),%ecx
-	subl  %ecx,%esp
-
-	movl  %esp,%eax
-
-	/* Place all of the ffi_prep_args in position  */
-	pushl 12(%ebp)
-	pushl %eax
-	call  *8(%ebp)
-
-	/* Return stack to previous state and call the function  */
-	addl  $8,%esp	
-
-	call  *28(%ebp)
-
-	/* Load %ecx with the return type code  */
-	movl  20(%ebp),%ecx	
-
-	/* Protect %esi.  We're going to pop it in the epilogue.  */
-	pushl %esi
-
-	/* If the return value pointer is NULL, assume no return value.  */
-	cmpl  $0,24(%ebp)
-	jne  0f
-
-	/* Even if there is no space for the return value, we are 
-	   obliged to handle floating-point values.  */
-	cmpl  $FFI_TYPE_FLOAT,%ecx
-	jne   noretval
-	fstp  %st(0)
-
-        jmp   epilogue
-
-0:
-	call  1f
-
-.Lstore_table:
-	.long	noretval-.Lstore_table	/* FFI_TYPE_VOID */
-	.long	retint-.Lstore_table	/* FFI_TYPE_INT */
-	.long	retfloat-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	retdouble-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	retlongdouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	retuint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	retsint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	retuint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	retsint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	retstruct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	retint-.Lstore_table	/* FFI_TYPE_POINTER */
-	.long   retstruct1b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long   retstruct2b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_2B */
-
-1:
-	pop  %esi
-	add  (%esi, %ecx, 4), %esi
-	jmp  *%esi
-
-	/* Sign/zero extend as appropriate.  */
-retsint8:
-	movsbl  %al, %eax
-	jmp  retint
-
-retsint16:
-	movswl  %ax, %eax
-	jmp  retint
-
-retuint8:
-	movzbl  %al, %eax
-	jmp  retint
-
-retuint16:
-	movzwl  %ax, %eax
-	jmp  retint
-
-retfloat:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstps (%ecx)
-	jmp   epilogue
-
-retdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpl (%ecx)
-	jmp   epilogue
-
-retlongdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpt (%ecx)
-	jmp   epilogue
-	
-retint64:	
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-	movl  %edx,4(%ecx)
-	jmp   epilogue
-	
-retstruct1b:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movb  %al,0(%ecx)
-	jmp   epilogue
-
-retstruct2b:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movw  %ax,0(%ecx)
-	jmp   epilogue
-
-retint:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-
-retstruct:
-	/* Nothing to do!  */
-
-noretval:
-epilogue:
-        popl %esi
-        movl %ebp,%esp
-        popl %ebp
-        ret
-.LFE1:
-.ffi_call_SYSV_end:
-        .size    ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
-
-	.align	4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl ffi_closure_SYSV
-	.type	ffi_closure_SYSV, @function
-
-ffi_closure_SYSV:
-.LFB2:
-	pushl	%ebp
-.LCFI2:
-	movl	%esp, %ebp
-.LCFI3:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
-	call	ffi_closure_SYSV_inner
-#else
-	movl	%ebx, 8(%esp)
-.LCFI7:
-	call	1f
-1:	popl	%ebx
-	addl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %ebx
-	call	ffi_closure_SYSV_inner@PLT
-	movl	8(%esp), %ebx
-#endif
-	movl	-12(%ebp), %ecx
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lcls_retint
-	
-0:	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lcls_retllong
-	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, %eax
-	je	.Lcls_retstruct1b
-	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, %eax
-	je	.Lcls_retstruct2b
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	.Lcls_retstruct
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-.Lcls_retstruct1b:
-	movsbl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retstruct2b:
-	movswl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retstruct:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret	$4
-.LFE2:
-	.size	ffi_closure_SYSV, .-ffi_closure_SYSV
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-
-	.align	4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl ffi_closure_raw_SYSV
-	.type	ffi_closure_raw_SYSV, @function
-
-ffi_closure_raw_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	pushl	%esi
-.LCFI6:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lrcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lrcls_retint
-0:
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lrcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lrcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lrcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lrcls_retllong
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
-	popl	%ebp
-	ret
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-.LFE3:
-	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
-#endif
-
-	.section	.eh_frame,EH_FRAME_FLAGS,@progbits
-.Lframe1:
-	.long	.LECIE1-.LSCIE1	/* Length of Common Information Entry */
-.LSCIE1:
-	.long	0x0	/* CIE Identifier Tag */
-	.byte	0x1	/* CIE Version */
-#ifdef __PIC__
-	.ascii "zR\0"	/* CIE Augmentation */
-#else
-	.ascii "\0"	/* CIE Augmentation */
-#endif
-	.byte	0x1	/* .uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x7c	/* .sleb128 -4; CIE Data Alignment Factor */
-	.byte	0x8	/* CIE RA Column */
-#ifdef __PIC__
-	.byte	0x1	/* .uleb128 0x1; Augmentation size */
-	.byte	0x1b	/* FDE Encoding (pcrel sdata4) */
-#endif
-	.byte	0xc	/* DW_CFA_def_cfa */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x88	/* DW_CFA_offset, column 0x8 */
-	.byte	0x1	/* .uleb128 0x1 */
-	.align 4
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#ifdef __PIC__
-	.long	.LFB1-.	/* FDE initial location */
-#else
-	.long	.LFB1	/* FDE initial location */
-#endif
-	.long	.LFE1-.LFB1	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI0-.LFB1
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI1-.LCFI0
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.align 4
-.LEFDE1:
-.LSFDE2:
-	.long	.LEFDE2-.LASFDE2	/* FDE Length */
-.LASFDE2:
-	.long	.LASFDE2-.Lframe1	/* FDE CIE offset */
-#ifdef __PIC__
-	.long	.LFB2-.	/* FDE initial location */
-#else
-	.long	.LFB2
-#endif
-	.long	.LFE2-.LFB2	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI2-.LFB2
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI3-.LCFI2
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-#if !defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE && defined __PIC__
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI7-.LCFI3
-	.byte	0x83	/* DW_CFA_offset, column 0x3 */
-	.byte	0xa	/* .uleb128 0xa */
-#endif
-	.align 4
-.LEFDE2:
-
-#if !FFI_NO_RAW_API
-
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#ifdef __PIC__
-	.long	.LFB3-.	/* FDE initial location */
-#else
-	.long	.LFB3
-#endif
-	.long	.LFE3-.LFB3	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI4-.LFB3
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI5-.LCFI4
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI6-.LCFI5
-	.byte	0x86	/* DW_CFA_offset, column 0x6 */
-	.byte	0x3	/* .uleb128 0x3 */
-	.align 4
-.LEFDE3:
-
-#endif
-
-#endif /* ifndef __x86_64__ */
diff --git a/libffi/src/x86/internal.h b/libffi/src/x86/internal.h
new file mode 100644
index 0000000..09771ba
--- /dev/null
+++ b/libffi/src/x86/internal.h
@@ -0,0 +1,29 @@
+#define X86_RET_FLOAT		0
+#define X86_RET_DOUBLE		1
+#define X86_RET_LDOUBLE		2
+#define X86_RET_SINT8		3
+#define X86_RET_SINT16		4
+#define X86_RET_UINT8		5
+#define X86_RET_UINT16		6
+#define X86_RET_INT64		7
+#define X86_RET_INT32		8
+#define X86_RET_VOID		9
+#define X86_RET_STRUCTPOP	10
+#define X86_RET_STRUCTARG       11
+#define X86_RET_STRUCT_1B	12
+#define X86_RET_STRUCT_2B	13
+#define X86_RET_UNUSED14	14
+#define X86_RET_UNUSED15	15
+
+#define X86_RET_TYPE_MASK	15
+#define X86_RET_POP_SHIFT	4
+
+#define R_EAX	0
+#define R_EDX	1
+#define R_ECX	2
+
+#ifdef __PCC__
+# define HAVE_FASTCALL 0
+#else
+# define HAVE_FASTCALL 1
+#endif
diff --git a/libffi/src/x86/internal64.h b/libffi/src/x86/internal64.h
new file mode 100644
index 0000000..512e955
--- /dev/null
+++ b/libffi/src/x86/internal64.h
@@ -0,0 +1,22 @@
+#define UNIX64_RET_VOID		0
+#define UNIX64_RET_UINT8	1
+#define UNIX64_RET_UINT16	2
+#define UNIX64_RET_UINT32	3
+#define UNIX64_RET_SINT8	4
+#define UNIX64_RET_SINT16	5
+#define UNIX64_RET_SINT32	6
+#define UNIX64_RET_INT64	7
+#define UNIX64_RET_XMM32	8
+#define UNIX64_RET_XMM64	9
+#define UNIX64_RET_X87		10
+#define UNIX64_RET_X87_2	11
+#define UNIX64_RET_ST_XMM0_RAX	12
+#define UNIX64_RET_ST_RAX_XMM0	13
+#define UNIX64_RET_ST_XMM0_XMM1	14
+#define UNIX64_RET_ST_RAX_RDX	15
+
+#define UNIX64_RET_LAST		15
+
+#define UNIX64_FLAG_RET_IN_MEM	(1 << 10)
+#define UNIX64_FLAG_XMM_ARGS	(1 << 11)
+#define UNIX64_SIZE_SHIFT	12
diff --git a/libffi/src/x86/sysv.S b/libffi/src/x86/sysv.S
index f108dd8..ebbea5d 100644
--- a/libffi/src/x86/sysv.S
+++ b/libffi/src/x86/sysv.S
@@ -1,5 +1,6 @@
 /* -----------------------------------------------------------------------
-   sysv.S - Copyright (c) 1996, 1998, 2001-2003, 2005, 2008, 2010  Red Hat, Inc.
+   sysv.S - Copyright (c) 2013  The Written Word, Inc.
+	  - Copyright (c) 1996,1998,2001-2003,2005,2008,2010  Red Hat, Inc.
    
    X86 Foreign Function Interface 
 
@@ -29,437 +30,1006 @@
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
 
-.text
-
-.globl ffi_prep_args
-
-	.align 4
-.globl ffi_call_SYSV
-        .type    ffi_call_SYSV,@function
-
-ffi_call_SYSV:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-	/* Make room for all of the new args.  */
-	movl  16(%ebp),%ecx
-	subl  %ecx,%esp
-
-        /* Align the stack pointer to 16-bytes */
-        andl  $0xfffffff0, %esp
-
-	movl  %esp,%eax
-
-	/* Place all of the ffi_prep_args in position  */
-	pushl 12(%ebp)
-	pushl %eax
-	call  *8(%ebp)
-
-	/* Return stack to previous state and call the function  */
-	addl  $8,%esp	
-
-	call  *28(%ebp)
-
-	/* Load %ecx with the return type code  */
-	movl  20(%ebp),%ecx	
-
-	/* Protect %esi.  We're going to pop it in the epilogue.  */
-	pushl %esi
-
-	/* If the return value pointer is NULL, assume no return value.  */
-	cmpl  $0,24(%ebp)
-	jne  0f
-
-	/* Even if there is no space for the return value, we are 
-	   obliged to handle floating-point values.  */
-	cmpl  $FFI_TYPE_FLOAT,%ecx
-	jne   noretval
-	fstp  %st(0)
-
-        jmp   epilogue
-
-0:
-	call  1f
-
-.Lstore_table:
-	.long	noretval-.Lstore_table	/* FFI_TYPE_VOID */
-	.long	retint-.Lstore_table	/* FFI_TYPE_INT */
-	.long	retfloat-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	retdouble-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	retlongdouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	retuint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	retsint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	retuint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	retsint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	retstruct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	retint-.Lstore_table	/* FFI_TYPE_POINTER */
-
-1:
-	pop  %esi
-	add  (%esi, %ecx, 4), %esi
-	jmp  *%esi
-
-	/* Sign/zero extend as appropriate.  */
-retsint8:
-	movsbl  %al, %eax
-	jmp  retint
+#define C2(X, Y)  X ## Y
+#define C1(X, Y)  C2(X, Y)
+#ifdef __USER_LABEL_PREFIX__
+# define C(X)     C1(__USER_LABEL_PREFIX__, X)
+#else
+# define C(X)     X
+#endif
 
-retsint16:
-	movswl  %ax, %eax
-	jmp  retint
+#ifdef X86_DARWIN
+# define L(X)     C1(L, X)
+#else
+# define L(X)     C1(.L, X)
+#endif
 
-retuint8:
-	movzbl  %al, %eax
-	jmp  retint
+#ifdef __ELF__
+# define ENDF(X)  .type	X,@function; .size X, . - X
+#else
+# define ENDF(X)
+#endif
 
-retuint16:
-	movzwl  %ax, %eax
-	jmp  retint
-
-retfloat:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstps (%ecx)
-	jmp   epilogue
-
-retdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpl (%ecx)
-	jmp   epilogue
-
-retlongdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpt (%ecx)
-	jmp   epilogue
-	
-retint64:	
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-	movl  %edx,4(%ecx)
-	jmp   epilogue
-	
-retint:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-
-retstruct:
-	/* Nothing to do!  */
-
-noretval:
-epilogue:
-        popl %esi
-        movl %ebp,%esp
-        popl %ebp
-        ret
-.LFE1:
-.ffi_call_SYSV_end:
-        .size    ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
-
-	.align	4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl ffi_closure_SYSV
-	.type	ffi_closure_SYSV, @function
-
-ffi_closure_SYSV:
-.LFB2:
-	pushl	%ebp
-.LCFI2:
-	movl	%esp, %ebp
-.LCFI3:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
-	call	ffi_closure_SYSV_inner
+/* Handle win32 fastcall name mangling.  */
+#ifdef X86_WIN32
+# define ffi_call_i386		@ffi_call_i386@8
+# define ffi_closure_inner	@ffi_closure_inner@8
 #else
-	movl	%ebx, 8(%esp)
-.LCFI7:
-	call	1f
-1:	popl	%ebx
-	addl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %ebx
-	call	ffi_closure_SYSV_inner@PLT
-	movl	8(%esp), %ebx
+# define ffi_call_i386		C(ffi_call_i386)
+# define ffi_closure_inner	C(ffi_closure_inner)
 #endif
-	movl	-12(%ebp), %ecx
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lcls_retint
-	
-0:	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lcls_retllong
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	.Lcls_retstruct
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-.Lcls_retstruct:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret	$4
-.LFE2:
-	.size	ffi_closure_SYSV, .-ffi_closure_SYSV
 
-#if !FFI_NO_RAW_API
+/* This macro allows the safe creation of jump tables without an
+   actual table.  The entry points into the table are all 8 bytes.
+   The use of ORG asserts that we're at the correct location.  */
+/* ??? The clang assembler doesn't handle .org with symbolic expressions.  */
+#if defined(__clang__) || defined(__APPLE__)
+# define E(BASE, X)	.balign 8
+#else
+# define E(BASE, X)	.balign 8; .org BASE + X * 8
+#endif
+
+	.text
+	.balign	16
+	.globl	ffi_call_i386
+	FFI_HIDDEN(ffi_call_i386)
+
+/* This is declared as
+
+   void ffi_call_i386(struct call_frame *frame, char *argp)
+        __attribute__((fastcall));
+
+   Thus the arguments are present in
+
+        ecx: frame
+        edx: argp
+*/
+
+ffi_call_i386:
+L(UW0):
+	# cfi_startproc
+#if !HAVE_FASTCALL
+	movl	4(%esp), %ecx
+	movl	8(%esp), %edx
+#endif
+	movl	(%esp), %eax		/* move the return address */
+	movl	%ebp, (%ecx)		/* store %ebp into local frame */
+	movl	%eax, 4(%ecx)		/* store retaddr into local frame */
+
+	/* New stack frame based off ebp.  This is a itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-4, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	movl	%ecx, %ebp
+L(UW1):
+	# cfi_def_cfa(%ebp, 8)
+	# cfi_rel_offset(%ebp, 0)
 
-/* Precalculate for e.g. the Solaris 10/x86 assembler.  */
-#if FFI_TRAMPOLINE_SIZE == 10
-#define RAW_CLOSURE_CIF_OFFSET 12
-#define RAW_CLOSURE_FUN_OFFSET 16
-#define RAW_CLOSURE_USER_DATA_OFFSET 20
-#elif FFI_TRAMPOLINE_SIZE == 24
-#define RAW_CLOSURE_CIF_OFFSET 24
-#define RAW_CLOSURE_FUN_OFFSET 28
-#define RAW_CLOSURE_USER_DATA_OFFSET 32
+	movl	%edx, %esp		/* set outgoing argument stack */
+	movl	20+R_EAX*4(%ebp), %eax	/* set register arguments */
+	movl	20+R_EDX*4(%ebp), %edx
+	movl	20+R_ECX*4(%ebp), %ecx
+
+	call	*8(%ebp)
+
+	movl	12(%ebp), %ecx		/* load return type code */
+	movl	%ebx, 8(%ebp)		/* preserve %ebx */
+L(UW2):
+	# cfi_rel_offset(%ebx, 8)
+
+	andl	$X86_RET_TYPE_MASK, %ecx
+#ifdef __PIC__
+	call	C(__x86.get_pc_thunk.bx)
+L(pc1):
+	leal	L(store_table)-L(pc1)(%ebx, %ecx, 8), %ebx
 #else
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
+	leal	L(store_table)(,%ecx, 8), %ebx
 #endif
-#define CIF_FLAGS_OFFSET 20
-
-	.align	4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl ffi_closure_raw_SYSV
-	.type	ffi_closure_raw_SYSV, @function
-
-ffi_closure_raw_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	pushl	%esi
-.LCFI6:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lrcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lrcls_retint
-0:
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lrcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lrcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lrcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lrcls_retllong
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
+	movl	16(%ebp), %ecx		/* load result address */
+	jmp	*%ebx
+
+	.balign	8
+L(store_table):
+E(L(store_table), X86_RET_FLOAT)
+	fstps	(%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_DOUBLE)
+	fstpl	(%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_LDOUBLE)
+	fstpt	(%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_SINT8)
+	movsbl	%al, %eax
+	mov	%eax, (%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_SINT16)
+	movswl	%ax, %eax
+	mov	%eax, (%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_UINT8)
+	movzbl	%al, %eax
+	mov	%eax, (%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_UINT16)
+	movzwl	%ax, %eax
+	mov	%eax, (%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_INT64)
+	movl	%edx, 4(%ecx)
+	/* fallthru */
+E(L(store_table), X86_RET_INT32)
+	movl	%eax, (%ecx)
+	/* fallthru */
+E(L(store_table), X86_RET_VOID)
+L(e1):
+	movl	8(%ebp), %ebx
+	movl	%ebp, %esp
 	popl	%ebp
+L(UW3):
+	# cfi_remember_state
+	# cfi_def_cfa(%esp, 4)
+	# cfi_restore(%ebx)
+	# cfi_restore(%ebp)
 	ret
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-.LFE3:
-	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
+L(UW4):
+	# cfi_restore_state
+
+E(L(store_table), X86_RET_STRUCTPOP)
+	jmp	L(e1)
+E(L(store_table), X86_RET_STRUCTARG)
+	jmp	L(e1)
+E(L(store_table), X86_RET_STRUCT_1B)
+	movb	%al, (%ecx)
+	jmp	L(e1)
+E(L(store_table), X86_RET_STRUCT_2B)
+	movw	%ax, (%ecx)
+	jmp	L(e1)
+
+	/* Fill out the table so that bad values are predictable.  */
+E(L(store_table), X86_RET_UNUSED14)
+	ud2
+E(L(store_table), X86_RET_UNUSED15)
+	ud2
+
+L(UW5):
+	# cfi_endproc
+ENDF(ffi_call_i386)
+
+/* The inner helper is declared as
+
+   void ffi_closure_inner(struct closure_frame *frame, char *argp)
+	__attribute_((fastcall))
+
+   Thus the arguments are placed in
+
+	ecx:	frame
+	edx:	argp
+*/
+
+/* Macros to help setting up the closure_data structure.  */
+
+#if HAVE_FASTCALL
+# define closure_FS	(40 + 4)
+# define closure_CF	0
+#else
+# define closure_FS	(8 + 40 + 12)
+# define closure_CF	8
 #endif
 
-#if defined __PIC__
-# if defined __sun__ && defined __svr4__
-/* 32-bit Solaris 2/x86 uses datarel encoding for PIC.  GNU ld before 2.22
-   doesn't correctly sort .eh_frame_hdr with mixed encodings, so match this.  */
-#  define FDE_ENCODING		0x30	/* datarel */
-#  define FDE_ENCODE(X)		X@GOTOFF
-# else
-#  define FDE_ENCODING		0x1b	/* pcrel sdata4 */
-#  if defined HAVE_AS_X86_PCREL
-#   define FDE_ENCODE(X)	X-.
-#  else
-#   define FDE_ENCODE(X)	X@rel
-#  endif
-# endif
+#define FFI_CLOSURE_SAVE_REGS		\
+	movl	%eax, closure_CF+16+R_EAX*4(%esp);	\
+	movl	%edx, closure_CF+16+R_EDX*4(%esp);	\
+	movl	%ecx, closure_CF+16+R_ECX*4(%esp)
+
+#define FFI_CLOSURE_COPY_TRAMP_DATA					\
+	movl	FFI_TRAMPOLINE_SIZE(%eax), %edx;	/* copy cif */	\
+	movl	FFI_TRAMPOLINE_SIZE+4(%eax), %ecx;	/* copy fun */	\
+	movl	FFI_TRAMPOLINE_SIZE+8(%eax), %eax;	/* copy user_data */ \
+	movl	%edx, closure_CF+28(%esp);				\
+	movl	%ecx, closure_CF+32(%esp);				\
+	movl	%eax, closure_CF+36(%esp)
+
+#if HAVE_FASTCALL
+# define FFI_CLOSURE_PREP_CALL						\
+	movl	%esp, %ecx;			/* load closure_data */	\
+	leal	closure_FS+4(%esp), %edx;	/* load incoming stack */
 #else
-# define FDE_ENCODING		0	/* absolute */
-# define FDE_ENCODE(X)		X
+# define FFI_CLOSURE_PREP_CALL						\
+	leal	closure_CF(%esp), %ecx;		/* load closure_data */	\
+	leal	closure_FS+4(%esp), %edx;	/* load incoming stack */ \
+	movl	%ecx, (%esp);						\
+	movl	%edx, 4(%esp)
 #endif
 
-	.section	.eh_frame,EH_FRAME_FLAGS,@progbits
-.Lframe1:
-	.long	.LECIE1-.LSCIE1	/* Length of Common Information Entry */
-.LSCIE1:
-	.long	0x0	/* CIE Identifier Tag */
-	.byte	0x1	/* CIE Version */
-#ifdef HAVE_AS_ASCII_PSEUDO_OP
+#define FFI_CLOSURE_CALL_INNER(UWN) \
+	call	ffi_closure_inner
+
+#define FFI_CLOSURE_MASK_AND_JUMP(N, UW)				\
+	andl	$X86_RET_TYPE_MASK, %eax;				\
+	leal	L(C1(load_table,N))(, %eax, 8), %edx;			\
+	movl	closure_CF(%esp), %eax;		/* optimiztic load */	\
+	jmp	*%edx
+
 #ifdef __PIC__
-	.ascii "zR\0"	/* CIE Augmentation */
+# if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
+#  undef FFI_CLOSURE_MASK_AND_JUMP
+#  define FFI_CLOSURE_MASK_AND_JUMP(N, UW)				\
+	andl	$X86_RET_TYPE_MASK, %eax;				\
+	call	C(__x86.get_pc_thunk.dx);				\
+L(C1(pc,N)):								\
+	leal	L(C1(load_table,N))-L(C1(pc,N))(%edx, %eax, 8), %edx;	\
+	movl	closure_CF(%esp), %eax;		/* optimiztic load */	\
+	jmp	*%edx
+# else
+#  define FFI_CLOSURE_CALL_INNER_SAVE_EBX
+#  undef FFI_CLOSURE_CALL_INNER
+#  define FFI_CLOSURE_CALL_INNER(UWN)					\
+	movl	%ebx, 40(%esp);			/* save ebx */		\
+L(C1(UW,UWN)):								\
+	# cfi_rel_offset(%ebx, 40);					\
+	call	C(__x86.get_pc_thunk.bx);	/* load got register */	\
+	addl	$C(_GLOBAL_OFFSET_TABLE_), %ebx;			\
+	call	ffi_closure_inner@PLT
+#  undef FFI_CLOSURE_MASK_AND_JUMP
+#  define FFI_CLOSURE_MASK_AND_JUMP(N, UWN)				\
+	andl	$X86_RET_TYPE_MASK, %eax;				\
+	leal	L(C1(load_table,N))@GOTOFF(%ebx, %eax, 8), %edx;	\
+	movl	40(%esp), %ebx;			/* restore ebx */	\
+L(C1(UW,UWN)):								\
+	# cfi_restore(%ebx);						\
+	movl	closure_CF(%esp), %eax;		/* optimiztic load */	\
+	jmp	*%edx
+# endif /* DARWIN || HIDDEN */
+#endif /* __PIC__ */
+
+	.balign	16
+	.globl	C(ffi_go_closure_EAX)
+	FFI_HIDDEN(C(ffi_go_closure_EAX))
+C(ffi_go_closure_EAX):
+L(UW6):
+	# cfi_startproc
+	subl	$closure_FS, %esp
+L(UW7):
+	# cfi_def_cfa_offset(closure_FS + 4)
+	FFI_CLOSURE_SAVE_REGS
+	movl	4(%eax), %edx			/* copy cif */
+	movl	8(%eax), %ecx			/* copy fun */
+	movl	%edx, closure_CF+28(%esp)
+	movl	%ecx, closure_CF+32(%esp)
+	movl	%eax, closure_CF+36(%esp)	/* closure is user_data */
+	jmp	L(do_closure_i386)
+L(UW8):
+	# cfi_endproc
+ENDF(C(ffi_go_closure_EAX))
+
+	.balign	16
+	.globl	C(ffi_go_closure_ECX)
+	FFI_HIDDEN(C(ffi_go_closure_ECX))
+C(ffi_go_closure_ECX):
+L(UW9):
+	# cfi_startproc
+	subl	$closure_FS, %esp
+L(UW10):
+	# cfi_def_cfa_offset(closure_FS + 4)
+	FFI_CLOSURE_SAVE_REGS
+	movl	4(%ecx), %edx			/* copy cif */
+	movl	8(%ecx), %eax			/* copy fun */
+	movl	%edx, closure_CF+28(%esp)
+	movl	%eax, closure_CF+32(%esp)
+	movl	%ecx, closure_CF+36(%esp)	/* closure is user_data */
+	jmp	L(do_closure_i386)
+L(UW11):
+	# cfi_endproc
+ENDF(C(ffi_go_closure_ECX))
+
+/* The closure entry points are reached from the ffi_closure trampoline.
+   On entry, %eax contains the address of the ffi_closure.  */
+
+	.balign	16
+	.globl	C(ffi_closure_i386)
+	FFI_HIDDEN(C(ffi_closure_i386))
+
+C(ffi_closure_i386):
+L(UW12):
+	# cfi_startproc
+	subl	$closure_FS, %esp
+L(UW13):
+	# cfi_def_cfa_offset(closure_FS + 4)
+
+	FFI_CLOSURE_SAVE_REGS
+	FFI_CLOSURE_COPY_TRAMP_DATA
+
+	/* Entry point from preceeding Go closures.  */
+L(do_closure_i386):
+
+	FFI_CLOSURE_PREP_CALL
+	FFI_CLOSURE_CALL_INNER(14)
+	FFI_CLOSURE_MASK_AND_JUMP(2, 15)
+
+	.balign	8
+L(load_table2):
+E(L(load_table2), X86_RET_FLOAT)
+	flds	closure_CF(%esp)
+	jmp	L(e2)
+E(L(load_table2), X86_RET_DOUBLE)
+	fldl	closure_CF(%esp)
+	jmp	L(e2)
+E(L(load_table2), X86_RET_LDOUBLE)
+	fldt	closure_CF(%esp)
+	jmp	L(e2)
+E(L(load_table2), X86_RET_SINT8)
+	movsbl	%al, %eax
+	jmp	L(e2)
+E(L(load_table2), X86_RET_SINT16)
+	movswl	%ax, %eax
+	jmp	L(e2)
+E(L(load_table2), X86_RET_UINT8)
+	movzbl	%al, %eax
+	jmp	L(e2)
+E(L(load_table2), X86_RET_UINT16)
+	movzwl	%ax, %eax
+	jmp	L(e2)
+E(L(load_table2), X86_RET_INT64)
+	movl	closure_CF+4(%esp), %edx
+	jmp	L(e2)
+E(L(load_table2), X86_RET_INT32)
+	nop
+	/* fallthru */
+E(L(load_table2), X86_RET_VOID)
+L(e2):
+	addl	$closure_FS, %esp
+L(UW16):
+	# cfi_adjust_cfa_offset(-closure_FS)
+	ret
+L(UW17):
+	# cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTPOP)
+	addl	$closure_FS, %esp
+L(UW18):
+	# cfi_adjust_cfa_offset(-closure_FS)
+	ret	$4
+L(UW19):
+	# cfi_adjust_cfa_offset(closure_FS)
+E(L(load_table2), X86_RET_STRUCTARG)
+	jmp	L(e2)
+E(L(load_table2), X86_RET_STRUCT_1B)
+	movzbl	%al, %eax
+	jmp	L(e2)
+E(L(load_table2), X86_RET_STRUCT_2B)
+	movzwl	%ax, %eax
+	jmp	L(e2)
+
+	/* Fill out the table so that bad values are predictable.  */
+E(L(load_table2), X86_RET_UNUSED14)
+	ud2
+E(L(load_table2), X86_RET_UNUSED15)
+	ud2
+
+L(UW20):
+	# cfi_endproc
+ENDF(C(ffi_closure_i386))
+
+	.balign	16
+	.globl	C(ffi_go_closure_STDCALL)
+	FFI_HIDDEN(C(ffi_go_closure_STDCALL))
+C(ffi_go_closure_STDCALL):
+L(UW21):
+	# cfi_startproc
+	subl	$closure_FS, %esp
+L(UW22):
+	# cfi_def_cfa_offset(closure_FS + 4)
+	FFI_CLOSURE_SAVE_REGS
+	movl	4(%ecx), %edx			/* copy cif */
+	movl	8(%ecx), %eax			/* copy fun */
+	movl	%edx, closure_CF+28(%esp)
+	movl	%eax, closure_CF+32(%esp)
+	movl	%ecx, closure_CF+36(%esp)	/* closure is user_data */
+	jmp	L(do_closure_STDCALL)
+L(UW23):
+	# cfi_endproc
+ENDF(C(ffi_go_closure_STDCALL))
+
+/* For REGISTER, we have no available parameter registers, and so we
+   enter here having pushed the closure onto the stack.  */
+
+	.balign	16
+	.globl	C(ffi_closure_REGISTER)
+	FFI_HIDDEN(C(ffi_closure_REGISTER))
+C(ffi_closure_REGISTER):
+L(UW24):
+	# cfi_startproc
+	# cfi_def_cfa(%esp, 8)
+	# cfi_offset(%eip, -8)
+	subl	$closure_FS-4, %esp
+L(UW25):
+	# cfi_def_cfa_offset(closure_FS + 4)
+	FFI_CLOSURE_SAVE_REGS
+	movl	closure_FS-4(%esp), %ecx	/* load retaddr */
+	movl	closure_FS(%esp), %eax		/* load closure */
+	movl	%ecx, closure_FS(%esp)		/* move retaddr */
+	jmp	L(do_closure_REGISTER)
+L(UW26):
+	# cfi_endproc
+ENDF(C(ffi_closure_REGISTER))
+
+/* For STDCALL (and others), we need to pop N bytes of arguments off
+   the stack following the closure.  The amount needing to be popped
+   is returned to us from ffi_closure_inner.  */
+
+	.balign	16
+	.globl	C(ffi_closure_STDCALL)
+	FFI_HIDDEN(C(ffi_closure_STDCALL))
+C(ffi_closure_STDCALL):
+L(UW27):
+	# cfi_startproc
+	subl	$closure_FS, %esp
+L(UW28):
+	# cfi_def_cfa_offset(closure_FS + 4)
+
+	FFI_CLOSURE_SAVE_REGS
+
+	/* Entry point from ffi_closure_REGISTER.  */
+L(do_closure_REGISTER):
+
+	FFI_CLOSURE_COPY_TRAMP_DATA
+
+	/* Entry point from preceeding Go closure.  */
+L(do_closure_STDCALL):
+
+	FFI_CLOSURE_PREP_CALL
+	FFI_CLOSURE_CALL_INNER(29)
+
+	movl	%eax, %ecx
+	shrl	$X86_RET_POP_SHIFT, %ecx	/* isolate pop count */
+	leal	closure_FS(%esp, %ecx), %ecx	/* compute popped esp */
+	movl	closure_FS(%esp), %edx		/* move return address */
+	movl	%edx, (%ecx)
+
+	/* From this point on, the value of %esp upon return is %ecx+4,
+	   and we've copied the return address to %ecx to make return easy.
+	   There's no point in representing this in the unwind info, as
+	   there is always a window between the mov and the ret which
+	   will be wrong from one point of view or another.  */
+
+	FFI_CLOSURE_MASK_AND_JUMP(3, 30)
+
+	.balign	8
+L(load_table3):
+E(L(load_table3), X86_RET_FLOAT)
+	flds    closure_CF(%esp)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_DOUBLE)
+	fldl    closure_CF(%esp)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_LDOUBLE)
+	fldt    closure_CF(%esp)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_SINT8)
+	movsbl  %al, %eax
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_SINT16)
+	movswl  %ax, %eax
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_UINT8)
+	movzbl  %al, %eax
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_UINT16)
+	movzwl  %ax, %eax
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_INT64)
+	movl	closure_CF+4(%esp), %edx
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_INT32)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_VOID)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_STRUCTPOP)
+	movl    %ecx, %esp
+	ret
+E(L(load_table3), X86_RET_STRUCTARG)
+	movl	%ecx, %esp
+	ret
+E(L(load_table3), X86_RET_STRUCT_1B)
+	movzbl	%al, %eax
+	movl	%ecx, %esp
+	ret
+E(L(load_table3), X86_RET_STRUCT_2B)
+	movzwl	%ax, %eax
+	movl	%ecx, %esp
+	ret
+
+	/* Fill out the table so that bad values are predictable.  */
+E(L(load_table3), X86_RET_UNUSED14)
+	ud2
+E(L(load_table3), X86_RET_UNUSED15)
+	ud2
+
+L(UW31):
+	# cfi_endproc
+ENDF(C(ffi_closure_STDCALL))
+
+#if !FFI_NO_RAW_API
+
+#define raw_closure_S_FS	(16+16+12)
+
+	.balign	16
+	.globl	C(ffi_closure_raw_SYSV)
+	FFI_HIDDEN(C(ffi_closure_raw_SYSV))
+C(ffi_closure_raw_SYSV):
+L(UW32):
+	# cfi_startproc
+	subl	$raw_closure_S_FS, %esp
+L(UW33):
+	# cfi_def_cfa_offset(raw_closure_S_FS + 4)
+	movl	%ebx, raw_closure_S_FS-4(%esp)
+L(UW34):
+	# cfi_rel_offset(%ebx, raw_closure_S_FS-4)
+
+	movl	FFI_TRAMPOLINE_SIZE+8(%eax), %edx	/* load cl->user_data */
+	movl	%edx, 12(%esp)
+	leal	raw_closure_S_FS+4(%esp), %edx		/* load raw_args */
+	movl	%edx, 8(%esp)
+	leal	16(%esp), %edx				/* load &res */
+	movl	%edx, 4(%esp)
+	movl	FFI_TRAMPOLINE_SIZE(%eax), %ebx		/* load cl->cif */
+	movl	%ebx, (%esp)
+	call	*FFI_TRAMPOLINE_SIZE+4(%eax)		/* call cl->fun */
+
+	movl	20(%ebx), %eax				/* load cif->flags */
+	andl	$X86_RET_TYPE_MASK, %eax
+#ifdef __PIC__
+	call	C(__x86.get_pc_thunk.bx)
+L(pc4):
+	leal	L(load_table4)-L(pc4)(%ebx, %eax, 8), %ecx
 #else
-	.ascii "\0"	/* CIE Augmentation */
+	leal	L(load_table4)(,%eax, 8), %ecx
 #endif
-#elif defined HAVE_AS_STRING_PSEUDO_OP
+	movl	raw_closure_S_FS-4(%esp), %ebx
+L(UW35):
+	# cfi_restore(%ebx)
+	movl	16(%esp), %eax				/* Optimistic load */
+	jmp	*%ecx
+
+	.balign	8
+L(load_table4):
+E(L(load_table4), X86_RET_FLOAT)
+	flds	16(%esp)
+	jmp	L(e4)
+E(L(load_table4), X86_RET_DOUBLE)
+	fldl	16(%esp)
+	jmp	L(e4)
+E(L(load_table4), X86_RET_LDOUBLE)
+	fldt	16(%esp)
+	jmp	L(e4)
+E(L(load_table4), X86_RET_SINT8)
+	movsbl	%al, %eax
+	jmp	L(e4)
+E(L(load_table4), X86_RET_SINT16)
+	movswl	%ax, %eax
+	jmp	L(e4)
+E(L(load_table4), X86_RET_UINT8)
+	movzbl	%al, %eax
+	jmp	L(e4)
+E(L(load_table4), X86_RET_UINT16)
+	movzwl	%ax, %eax
+	jmp	L(e4)
+E(L(load_table4), X86_RET_INT64)
+	movl	16+4(%esp), %edx
+	jmp	L(e4)
+E(L(load_table4), X86_RET_INT32)
+	nop
+	/* fallthru */
+E(L(load_table4), X86_RET_VOID)
+L(e4):
+	addl	$raw_closure_S_FS, %esp
+L(UW36):
+	# cfi_adjust_cfa_offset(-raw_closure_S_FS)
+	ret
+L(UW37):
+	# cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTPOP)
+	addl	$raw_closure_S_FS, %esp
+L(UW38):
+	# cfi_adjust_cfa_offset(-raw_closure_S_FS)
+	ret	$4
+L(UW39):
+	# cfi_adjust_cfa_offset(raw_closure_S_FS)
+E(L(load_table4), X86_RET_STRUCTARG)
+	jmp	L(e4)
+E(L(load_table4), X86_RET_STRUCT_1B)
+	movzbl	%al, %eax
+	jmp	L(e4)
+E(L(load_table4), X86_RET_STRUCT_2B)
+	movzwl	%ax, %eax
+	jmp	L(e4)
+
+	/* Fill out the table so that bad values are predictable.  */
+E(L(load_table4), X86_RET_UNUSED14)
+	ud2
+E(L(load_table4), X86_RET_UNUSED15)
+	ud2
+
+L(UW40):
+	# cfi_endproc
+ENDF(C(ffi_closure_raw_SYSV))
+
+#define raw_closure_T_FS	(16+16+8)
+
+	.balign	16
+	.globl	C(ffi_closure_raw_THISCALL)
+	FFI_HIDDEN(C(ffi_closure_raw_THISCALL))
+C(ffi_closure_raw_THISCALL):
+L(UW41):
+	# cfi_startproc
+	/* Rearrange the stack such that %ecx is the first argument.
+	   This means moving the return address.  */
+	popl	%edx
+L(UW42):
+	# cfi_def_cfa_offset(0)
+	# cfi_register(%eip, %edx)
+	pushl	%ecx
+L(UW43):
+	# cfi_adjust_cfa_offset(4)
+	pushl	%edx
+L(UW44):
+	# cfi_adjust_cfa_offset(4)
+	# cfi_rel_offset(%eip, 0)
+	subl	$raw_closure_T_FS, %esp
+L(UW45):
+	# cfi_adjust_cfa_offset(raw_closure_T_FS)
+	movl	%ebx, raw_closure_T_FS-4(%esp)
+L(UW46):
+	# cfi_rel_offset(%ebx, raw_closure_T_FS-4)
+
+	movl	FFI_TRAMPOLINE_SIZE+8(%eax), %edx	/* load cl->user_data */
+	movl	%edx, 12(%esp)
+	leal	raw_closure_T_FS+4(%esp), %edx		/* load raw_args */
+	movl	%edx, 8(%esp)
+	leal	16(%esp), %edx				/* load &res */
+	movl	%edx, 4(%esp)
+	movl	FFI_TRAMPOLINE_SIZE(%eax), %ebx		/* load cl->cif */
+	movl	%ebx, (%esp)
+	call	*FFI_TRAMPOLINE_SIZE+4(%eax)		/* call cl->fun */
+
+	movl	20(%ebx), %eax				/* load cif->flags */
+	andl	$X86_RET_TYPE_MASK, %eax
 #ifdef __PIC__
-	.string "zR"	/* CIE Augmentation */
+	call	C(__x86.get_pc_thunk.bx)
+L(pc5):
+	leal	L(load_table5)-L(pc5)(%ebx, %eax, 8), %ecx
 #else
-	.string ""	/* CIE Augmentation */
+	leal	L(load_table5)(,%eax, 8), %ecx
 #endif
+	movl	raw_closure_T_FS-4(%esp), %ebx
+L(UW47):
+	# cfi_restore(%ebx)
+	movl	16(%esp), %eax				/* Optimistic load */
+	jmp	*%ecx
+
+	.balign	8
+L(load_table5):
+E(L(load_table5), X86_RET_FLOAT)
+	flds	16(%esp)
+	jmp	L(e5)
+E(L(load_table5), X86_RET_DOUBLE)
+	fldl	16(%esp)
+	jmp	L(e5)
+E(L(load_table5), X86_RET_LDOUBLE)
+	fldt	16(%esp)
+	jmp	L(e5)
+E(L(load_table5), X86_RET_SINT8)
+	movsbl	%al, %eax
+	jmp	L(e5)
+E(L(load_table5), X86_RET_SINT16)
+	movswl	%ax, %eax
+	jmp	L(e5)
+E(L(load_table5), X86_RET_UINT8)
+	movzbl	%al, %eax
+	jmp	L(e5)
+E(L(load_table5), X86_RET_UINT16)
+	movzwl	%ax, %eax
+	jmp	L(e5)
+E(L(load_table5), X86_RET_INT64)
+	movl	16+4(%esp), %edx
+	jmp	L(e5)
+E(L(load_table5), X86_RET_INT32)
+	nop
+	/* fallthru */
+E(L(load_table5), X86_RET_VOID)
+L(e5):
+	addl	$raw_closure_T_FS, %esp
+L(UW48):
+	# cfi_adjust_cfa_offset(-raw_closure_T_FS)
+	/* Remove the extra %ecx argument we pushed.  */
+	ret	$4
+L(UW49):
+	# cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTPOP)
+	addl	$raw_closure_T_FS, %esp
+L(UW50):
+	# cfi_adjust_cfa_offset(-raw_closure_T_FS)
+	ret	$8
+L(UW51):
+	# cfi_adjust_cfa_offset(raw_closure_T_FS)
+E(L(load_table5), X86_RET_STRUCTARG)
+	jmp	L(e5)
+E(L(load_table5), X86_RET_STRUCT_1B)
+	movzbl	%al, %eax
+	jmp	L(e5)
+E(L(load_table5), X86_RET_STRUCT_2B)
+	movzwl	%ax, %eax
+	jmp	L(e5)
+
+	/* Fill out the table so that bad values are predictable.  */
+E(L(load_table5), X86_RET_UNUSED14)
+	ud2
+E(L(load_table5), X86_RET_UNUSED15)
+	ud2
+
+L(UW52):
+	# cfi_endproc
+ENDF(C(ffi_closure_raw_THISCALL))
+
+#endif /* !FFI_NO_RAW_API */
+
+#ifdef X86_DARWIN
+# define COMDAT(X)							\
+        .section __TEXT,__textcoal_nt,coalesced,pure_instructions;	\
+        .weak_definition X;						\
+        .private_extern X
+#elif defined __ELF__
+# define COMDAT(X)							\
+	.section .text.X,"axG",@progbits,X,comdat;			\
+	.globl	X;							\
+	FFI_HIDDEN(X)
 #else
-#error missing .ascii/.string
-#endif
-	.byte	0x1	/* .uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x7c	/* .sleb128 -4; CIE Data Alignment Factor */
-	.byte	0x8	/* CIE RA Column */
-#ifdef __PIC__
-	.byte	0x1	/* .uleb128 0x1; Augmentation size */
-	.byte	FDE_ENCODING
-#endif
-	.byte	0xc	/* DW_CFA_def_cfa */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x88	/* DW_CFA_offset, column 0x8 */
-	.byte	0x1	/* .uleb128 0x1 */
-	.align 4
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB1)	/* FDE initial location */
-	.long	.LFE1-.LFB1		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
+# define COMDAT(X)
 #endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI0-.LFB1
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI1-.LCFI0
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.align 4
-.LEFDE1:
-.LSFDE2:
-	.long	.LEFDE2-.LASFDE2	/* FDE Length */
-.LASFDE2:
-	.long	.LASFDE2-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB2)	/* FDE initial location */
-	.long	.LFE2-.LFB2		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
+
+#if defined(__PIC__)
+	COMDAT(C(__x86.get_pc_thunk.bx))
+C(__x86.get_pc_thunk.bx):
+	movl	(%esp), %ebx
+	ret
+ENDF(C(__x86.get_pc_thunk.bx))
+# if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
+	COMDAT(C(__x86.get_pc_thunk.dx))
+C(__x86.get_pc_thunk.dx):
+	movl	(%esp), %edx
+	ret
+ENDF(C(__x86.get_pc_thunk.dx))
+#endif /* DARWIN || HIDDEN */
+#endif /* __PIC__ */
+
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all.  */
+
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,"a",@unwind
+#else
+.section .eh_frame,"a",@progbits
 #endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI2-.LFB2
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI3-.LCFI2
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-#if !defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE && defined __PIC__
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI7-.LCFI3
-	.byte	0x83	/* DW_CFA_offset, column 0x3 */
-	.byte	0xa	/* .uleb128 0xa */
+
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X)	X - .
+#else
+# define PCREL(X)	X@rel
 #endif
-	.align 4
-.LEFDE2:
 
-#if !FFI_NO_RAW_API
+/* Simplify advancing between labels.  Assume DW_CFA_advance_loc1 fits.  */
+#define ADV(N, P)	.byte 2, L(N)-L(P)
 
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB3)	/* FDE initial location */
-	.long	.LFE3-.LFB3		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
+	.balign 4
+L(CIE):
+	.set	L(set0),L(ECIE)-L(SCIE)
+	.long	L(set0)			/* CIE Length */
+L(SCIE):
+	.long	0			/* CIE Identifier Tag */
+	.byte	1			/* CIE Version */
+	.ascii	"zR\0"			/* CIE Augmentation */
+	.byte	1			/* CIE Code Alignment Factor */
+	.byte	0x7c			/* CIE Data Alignment Factor */
+	.byte	0x8			/* CIE RA Column */
+	.byte	1			/* Augmentation size */
+	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
+	.byte	0xc, 4, 4		/* DW_CFA_def_cfa, %esp offset 4 */
+	.byte	0x80+8, 1		/* DW_CFA_offset, %eip offset 1*-4 */
+	.balign 4
+L(ECIE):
+
+	.set	L(set1),L(EFDE1)-L(SFDE1)
+	.long	L(set1)			/* FDE Length */
+L(SFDE1):
+	.long	L(SFDE1)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW0))		/* Initial location */
+	.long	L(UW5)-L(UW0)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW1, UW0)
+	.byte	0xc, 5, 8		/* DW_CFA_def_cfa, %ebp 8 */
+	.byte	0x80+5, 2		/* DW_CFA_offset, %ebp 2*-4 */
+	ADV(UW2, UW1)
+	.byte	0x80+3, 0		/* DW_CFA_offset, %ebx 0*-4 */
+	ADV(UW3, UW2)
+	.byte	0xa			/* DW_CFA_remember_state */
+	.byte	0xc, 4, 4		/* DW_CFA_def_cfa, %esp 4 */
+	.byte	0xc0+3			/* DW_CFA_restore, %ebx */
+	.byte	0xc0+5			/* DW_CFA_restore, %ebp */
+	ADV(UW4, UW3)
+	.byte	0xb			/* DW_CFA_restore_state */
+	.balign	4
+L(EFDE1):
+
+	.set	L(set2),L(EFDE2)-L(SFDE2)
+	.long	L(set2)			/* FDE Length */
+L(SFDE2):
+	.long	L(SFDE2)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW6))		/* Initial location */
+	.long	L(UW8)-L(UW6)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW7, UW6)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE2):
+
+	.set	L(set3),L(EFDE3)-L(SFDE3)
+	.long	L(set3)			/* FDE Length */
+L(SFDE3):
+	.long	L(SFDE3)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW9))		/* Initial location */
+	.long	L(UW11)-L(UW9)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW10, UW9)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE3):
+
+	.set	L(set4),L(EFDE4)-L(SFDE4)
+	.long	L(set4)			/* FDE Length */
+L(SFDE4):
+	.long	L(SFDE4)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW12))		/* Initial location */
+	.long	L(UW20)-L(UW12)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW13, UW12)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+	ADV(UW14, UW13)
+	.byte	0x80+3, (40-(closure_FS+4))/-4  /* DW_CFA_offset %ebx */
+	ADV(UW15, UW14)
+	.byte	0xc0+3			/* DW_CFA_restore %ebx */
+	ADV(UW16, UW15)
+#else
+	ADV(UW16, UW13)
 #endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI4-.LFB3
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI5-.LCFI4
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI6-.LCFI5
-	.byte	0x86	/* DW_CFA_offset, column 0x6 */
-	.byte	0x3	/* .uleb128 0x3 */
-	.align 4
-.LEFDE3:
+	.byte	0xe, 4			/* DW_CFA_def_cfa_offset */
+	ADV(UW17, UW16)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	ADV(UW18, UW17)
+	.byte	0xe, 4			/* DW_CFA_def_cfa_offset */
+	ADV(UW19, UW18)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE4):
+
+	.set	L(set5),L(EFDE5)-L(SFDE5)
+	.long	L(set5)			/* FDE Length */
+L(SFDE5):
+	.long	L(SFDE5)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW21))		/* Initial location */
+	.long	L(UW23)-L(UW21)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW22, UW21)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE5):
 
+	.set	L(set6),L(EFDE6)-L(SFDE6)
+	.long	L(set6)			/* FDE Length */
+L(SFDE6):
+	.long	L(SFDE6)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW24))		/* Initial location */
+	.long	L(UW26)-L(UW24)		/* Address range */
+	.byte	0			/* Augmentation size */
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset */
+	.byte	0x80+8, 2		/* DW_CFA_offset %eip, 2*-4 */
+	ADV(UW25, UW24)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE6):
+
+	.set	L(set7),L(EFDE7)-L(SFDE7)
+	.long	L(set7)			/* FDE Length */
+L(SFDE7):
+	.long	L(SFDE7)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW27))		/* Initial location */
+	.long	L(UW31)-L(UW27)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW28, UW27)
+	.byte	0xe, closure_FS+4	/* DW_CFA_def_cfa_offset */
+#ifdef FFI_CLOSURE_CALL_INNER_SAVE_EBX
+	ADV(UW29, UW28)
+	.byte	0x80+3, (40-(closure_FS+4))/-4  /* DW_CFA_offset %ebx */
+	ADV(UW30, UW29)
+	.byte	0xc0+3			/* DW_CFA_restore %ebx */
 #endif
+	.balign	4
+L(EFDE7):
+
+#if !FFI_NO_RAW_API
+	.set	L(set8),L(EFDE8)-L(SFDE8)
+	.long	L(set8)			/* FDE Length */
+L(SFDE8):
+	.long	L(SFDE8)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW32))		/* Initial location */
+	.long	L(UW40)-L(UW32)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW33, UW32)
+	.byte	0xe, raw_closure_S_FS+4	/* DW_CFA_def_cfa_offset */
+	ADV(UW34, UW33)
+	.byte	0x80+3, 2		/* DW_CFA_offset %ebx 2*-4 */
+	ADV(UW35, UW34)
+	.byte	0xc0+3			/* DW_CFA_restore %ebx */
+	ADV(UW36, UW35)
+	.byte	0xe, 4			/* DW_CFA_def_cfa_offset */
+	ADV(UW37, UW36)
+	.byte	0xe, raw_closure_S_FS+4	/* DW_CFA_def_cfa_offset */
+	ADV(UW38, UW37)
+	.byte	0xe, 4			/* DW_CFA_def_cfa_offset */
+	ADV(UW39, UW38)
+	.byte	0xe, raw_closure_S_FS+4	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE8):
+
+	.set	L(set9),L(EFDE9)-L(SFDE9)
+	.long	L(set9)			/* FDE Length */
+L(SFDE9):
+	.long	L(SFDE9)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW41))		/* Initial location */
+	.long	L(UW52)-L(UW41)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW42, UW41)
+	.byte	0xe, 0			/* DW_CFA_def_cfa_offset */
+	.byte	0x9, 8, 2		/* DW_CFA_register %eip, %edx */
+	ADV(UW43, UW42)
+	.byte	0xe, 4			/* DW_CFA_def_cfa_offset */
+	ADV(UW44, UW43)
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset */
+	.byte	0x80+8, 2		/* DW_CFA_offset %eip 2*-4 */
+	ADV(UW45, UW44)
+	.byte	0xe, raw_closure_T_FS+8	/* DW_CFA_def_cfa_offset */
+	ADV(UW46, UW45)
+	.byte	0x80+3, 3		/* DW_CFA_offset %ebx 3*-4 */
+	ADV(UW47, UW46)
+	.byte	0xc0+3			/* DW_CFA_restore %ebx */
+	ADV(UW48, UW47)
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset */
+	ADV(UW49, UW48)
+	.byte	0xe, raw_closure_T_FS+8	/* DW_CFA_def_cfa_offset */
+	ADV(UW50, UW49)
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset */
+	ADV(UW51, UW50)
+	.byte	0xe, raw_closure_T_FS+8	/* DW_CFA_def_cfa_offset */
+	.balign	4
+L(EFDE9):
+#endif /* !FFI_NO_RAW_API */
 
 #endif /* ifndef __x86_64__ */
 
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index 7a6619a..f9f9163 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -1,6 +1,7 @@
 /* -----------------------------------------------------------------------
-   unix64.S - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
-	      Copyright (c) 2008  Red Hat, Inc
+   unix64.S - Copyright (c) 2013  The Written Word, Inc.
+	    - Copyright (c) 2008  Red Hat, Inc
+	    - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
 
    x86-64 Foreign Function Interface 
 
@@ -29,8 +30,41 @@
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal64.h"
 
-.text
+	.text
+
+#define C2(X, Y)  X ## Y
+#define C1(X, Y)  C2(X, Y)
+#ifdef __USER_LABEL_PREFIX__
+# define C(X)     C1(__USER_LABEL_PREFIX__, X)
+#else
+# define C(X)     X
+#endif
+
+#ifdef __APPLE__
+# define L(X)     C1(L, X)
+#else
+# define L(X)     C1(.L, X)
+#endif
+
+#ifdef __ELF__
+# define PLT(X)	  X@PLT
+# define ENDF(X)  .type	X,@function; .size X, . - X
+#else
+# define PLT(X)	  X
+# define ENDF(X)
+#endif
+
+/* This macro allows the safe creation of jump tables without an
+   actual table.  The entry points into the table are all 8 bytes.
+   The use of ORG asserts that we're at the correct location.  */
+/* ??? The clang assembler doesn't handle .org with symbolic expressions.  */
+#if defined(__clang__) || defined(__APPLE__)
+# define E(BASE, X)	.balign 8
+#else
+# define E(BASE, X)	.balign 8; .org BASE + X * 8
+#endif
 
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 	            void *raddr, void (*fnaddr)(void));
@@ -39,12 +73,12 @@
    for this function.  This has been allocated by ffi_call.  We also
    deallocate some of the stack that has been alloca'd.  */
 
-	.align	2
-	.globl	ffi_call_unix64
-	.type	ffi_call_unix64,@function
+	.balign	8
+	.globl	C(ffi_call_unix64)
+	FFI_HIDDEN(C(ffi_call_unix64))
 
-ffi_call_unix64:
-.LUW0:
+C(ffi_call_unix64):
+L(UW0):
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
@@ -52,24 +86,37 @@ ffi_call_unix64:
 	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
 	movq	%r10, 24(%rax)		/* Relocate return address.  */
 	movq	%rax, %rbp		/* Finalize local stack frame.  */
-.LUW1:
+
+	/* New stack frame based off rbp.  This is a itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+L(UW1):
+	/* cfi_def_cfa(%rbp, 32) */
+	/* cfi_rel_offset(%rbp, 16) */
+
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
-	movq	40(%r10), %r9
+	movq	0x08(%r10), %rsi
+	movq	0x10(%r10), %rdx
+	movq	0x18(%r10), %rcx
+	movq	0x20(%r10), %r8
+	movq	0x28(%r10), %r9
+	movl	0xb0(%r10), %eax
 	testl	%eax, %eax
-	jnz	.Lload_sse
-.Lret_from_load_sse:
+	jnz	L(load_sse)
+L(ret_from_load_sse):
 
-	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
+	/* Deallocate the reg arg area, except for r10, then load via pop.  */
+	leaq	0xb8(%r10), %rsp
+	popq	%r10
 
 	/* Call the user function.  */
 	call	*%r11
@@ -80,347 +127,420 @@ ffi_call_unix64:
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-.LUW2:
+L(UW2):
+	/* cfi_remember_state */
+	/* cfi_def_cfa(%rsp, 8) */
+	/* cfi_restore(%rbp) */
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %cl
 	movzbl	%cl, %r10d
-	leaq	.Lstore_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
+	leaq	L(store_table)(%rip), %r11
+	ja	L(sa)
+	leaq	(%r11, %r10, 8), %r10
+
+	/* Prep for the structure cases: scratch area in redzone.  */
+	leaq	-20(%rsp), %rsi
 	jmp	*%r10
 
-.Lstore_table:
-	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
-	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
-
-	.align 2
-.Lst_void:
+	.balign	8
+L(store_table):
+E(L(store_table), UNIX64_RET_VOID)
 	ret
-	.align 2
-
-.Lst_uint8:
-	movzbq	%al, %rax
+E(L(store_table), UNIX64_RET_UINT8)
+	movzbl	%al, %eax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_sint8:
-	movsbq	%al, %rax
+E(L(store_table), UNIX64_RET_UINT16)
+	movzwl	%ax, %eax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_uint16:
-	movzwq	%ax, %rax
+E(L(store_table), UNIX64_RET_UINT32)
+	movl	%eax, %eax
 	movq	%rax, (%rdi)
-	.align 2
-.Lst_sint16:
-	movswq	%ax, %rax
+	ret
+E(L(store_table), UNIX64_RET_SINT8)
+	movsbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_uint32:
-	movl	%eax, %eax
+E(L(store_table), UNIX64_RET_SINT16)
+	movswq	%ax, %rax
 	movq	%rax, (%rdi)
-	.align 2
-.Lst_sint32:
+	ret
+E(L(store_table), UNIX64_RET_SINT32)
 	cltq
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_int64:
+E(L(store_table), UNIX64_RET_INT64)
 	movq	%rax, (%rdi)
 	ret
-
-	.align 2
-.Lst_float:
-	movss	%xmm0, (%rdi)
+E(L(store_table), UNIX64_RET_XMM32)
+	movd	%xmm0, (%rdi)
 	ret
-	.align 2
-.Lst_double:
-	movsd	%xmm0, (%rdi)
+E(L(store_table), UNIX64_RET_XMM64)
+	movq	%xmm0, (%rdi)
 	ret
-.Lst_ldouble:
+E(L(store_table), UNIX64_RET_X87)
 	fstpt	(%rdi)
 	ret
-
-	.align 2
-.Lst_struct:
-	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
-
-	/* We have to locate the values now, and since we don't want to
-	   write too much data into the user's return value, we spill the
-	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
-	   control where the values are located.  Only one of the three
-	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
-	movd	%xmm0, %r10
-	movd	%xmm1, %r11
-	testl	$0x100, %ecx
-	cmovnz	%rax, %rdx
-	cmovnz	%r10, %rax
-	testl	$0x200, %ecx
-	cmovnz	%r10, %rdx
-	testl	$0x400, %ecx
-	cmovnz	%r10, %rax
-	cmovnz	%r11, %rdx
-	movq	%rax, (%rsi)
+E(L(store_table), UNIX64_RET_X87_2)
+	fstpt	(%rdi)
+	fstpt	16(%rdi)
+	ret
+E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
+	movq	%rax, 8(%rsi)
+	jmp	L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
+	movq	%xmm0, 8(%rsi)
+	jmp	L(s2)
+E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
+	movq	%xmm1, 8(%rsi)
+	jmp	L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_RDX)
 	movq	%rdx, 8(%rsi)
-
-	/* Bits 12-31 contain the true size of the structure.  Copy from
-	   the scratch area to the true destination.  */
-	shrl	$12, %ecx
+L(s2):
+	movq	%rax, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
+	.balign 8
+L(s3):
+	movq	%xmm0, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
+	rep movsb
+	ret
+
+L(sa):	call	PLT(C(abort))
 
 	/* Many times we can avoid loading any SSE registers at all.
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
-	.align 2
-.LUW3:
-.Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
-	jmp	.Lret_from_load_sse
-
-.LUW4:
-	.size    ffi_call_unix64,.-ffi_call_unix64
-
-	.align	2
-	.globl ffi_closure_unix64
-	.type	ffi_closure_unix64,@function
-
-ffi_closure_unix64:
-.LUW5:
-	/* The carry flag is set by the trampoline iff SSE registers
-	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
-.LUW6:
-	movq	%rdi, (%rsp)
-	movq    %rsi, 8(%rsp)
-	movq    %rdx, 16(%rsp)
-	movq    %rcx, 24(%rsp)
-	movq    %r8, 32(%rsp)
-	movq    %r9, 40(%rsp)
-	jc      .Lsave_sse
-.Lret_from_save_sse:
-
-	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
-	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
-	call	ffi_closure_unix64_inner@PLT
+	.balign 2
+L(UW3):
+	/* cfi_restore_state */
+L(load_sse):
+	movdqa	0x30(%r10), %xmm0
+	movdqa	0x40(%r10), %xmm1
+	movdqa	0x50(%r10), %xmm2
+	movdqa	0x60(%r10), %xmm3
+	movdqa	0x70(%r10), %xmm4
+	movdqa	0x80(%r10), %xmm5
+	movdqa	0x90(%r10), %xmm6
+	movdqa	0xa0(%r10), %xmm7
+	jmp	L(ret_from_load_sse)
+
+L(UW4):
+ENDF(C(ffi_call_unix64))
+
+/* 6 general registers, 8 vector registers,
+   32 bytes of rvalue, 8 bytes of alignment.  */
+#define ffi_closure_OFS_G	0
+#define ffi_closure_OFS_V	(6*8)
+#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
+#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 32 + 8)
+
+/* The location of rvalue within the red zone after deallocating the frame.  */
+#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
+
+	.balign	2
+	.globl	C(ffi_closure_unix64_sse)
+	FFI_HIDDEN(C(ffi_closure_unix64_sse))
+
+C(ffi_closure_unix64_sse):
+L(UW5):
+	subq	$ffi_closure_FS, %rsp
+L(UW6):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	L(sse_entry1)
+
+L(UW7):
+ENDF(C(ffi_closure_unix64_sse))
+
+	.balign	2
+	.globl	C(ffi_closure_unix64)
+	FFI_HIDDEN(C(ffi_closure_unix64))
+
+C(ffi_closure_unix64):
+L(UW8):
+	subq	$ffi_closure_FS, %rsp
+L(UW9):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry1):
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+#ifdef __ILP32__
+	movl	FFI_TRAMPOLINE_SIZE(%r10), %edi		/* Load cif */
+	movl	FFI_TRAMPOLINE_SIZE+4(%r10), %esi	/* Load fun */
+	movl	FFI_TRAMPOLINE_SIZE+8(%r10), %edx	/* Load user_data */
+#else
+	movq	FFI_TRAMPOLINE_SIZE(%r10), %rdi		/* Load cif */
+	movq	FFI_TRAMPOLINE_SIZE+8(%r10), %rsi	/* Load fun */
+	movq	FFI_TRAMPOLINE_SIZE+16(%r10), %rdx	/* Load user_data */
+#endif
+L(do_closure):
+	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
+	movq	%rsp, %r8				/* Load reg_args */
+	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
+	call	C(ffi_closure_unix64_inner)
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
-.LUW7:
+	addq	$ffi_closure_FS, %rsp
+L(UW10):
+	/* cfi_adjust_cfa_offset(-ffi_closure_FS) */
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %al
 	movzbl	%al, %r10d
-	leaq	.Lload_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
+	leaq	L(load_table)(%rip), %r11
+	ja	L(la)
+	leaq	(%r11, %r10, 8), %r10
+	leaq	ffi_closure_RED_RVALUE(%rsp), %rsi
 	jmp	*%r10
 
-.Lload_table:
-	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
-	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
-	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
-	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
-
-	.align 2
-.Lld_void:
+	.balign	8
+L(load_table):
+E(L(load_table), UNIX64_RET_VOID)
 	ret
-
-	.align 2
-.Lld_int8:
-	movzbl	-24(%rsp), %eax
+E(L(load_table), UNIX64_RET_UINT8)
+	movzbl	(%rsi), %eax
 	ret
-	.align 2
-.Lld_int16:
-	movzwl	-24(%rsp), %eax
+E(L(load_table), UNIX64_RET_UINT16)
+	movzwl	(%rsi), %eax
 	ret
-	.align 2
-.Lld_int32:
-	movl	-24(%rsp), %eax
+E(L(load_table), UNIX64_RET_UINT32)
+	movl	(%rsi), %eax
 	ret
-	.align 2
-.Lld_int64:
-	movq	-24(%rsp), %rax
+E(L(load_table), UNIX64_RET_SINT8)
+	movsbl	(%rsi), %eax
 	ret
-
-	.align 2
-.Lld_float:
-	movss	-24(%rsp), %xmm0
+E(L(load_table), UNIX64_RET_SINT16)
+	movswl	(%rsi), %eax
 	ret
-	.align 2
-.Lld_double:
-	movsd	-24(%rsp), %xmm0
+E(L(load_table), UNIX64_RET_SINT32)
+	movl	(%rsi), %eax
 	ret
-	.align 2
-.Lld_ldouble:
-	fldt	-24(%rsp)
+E(L(load_table), UNIX64_RET_INT64)
+	movq	(%rsi), %rax
 	ret
-
-	.align 2
-.Lld_struct:
-	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
-	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
-	   both rdx and xmm1 with the second word.  For the remaining,
-	   bit 8 set means xmm0 gets the second word, and bit 9 means
-	   that rax gets the second word.  */
-	movq	-24(%rsp), %rcx
-	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
-	testl	$0x100, %eax
-	cmovnz	%rdx, %rcx
-	movd	%rcx, %xmm0
-	testl	$0x200, %eax
-	movq	-24(%rsp), %rax
-	cmovnz	%rdx, %rax
+E(L(load_table), UNIX64_RET_XMM32)
+	movd	(%rsi), %xmm0
+	ret
+E(L(load_table), UNIX64_RET_XMM64)
+	movq	(%rsi), %xmm0
+	ret
+E(L(load_table), UNIX64_RET_X87)
+	fldt	(%rsi)
+	ret
+E(L(load_table), UNIX64_RET_X87_2)
+	fldt	16(%rsi)
+	fldt	(%rsi)
+	ret
+E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
+	movq	8(%rsi), %rax
+	jmp	L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
+	movq	8(%rsi), %xmm0
+	jmp	L(l2)
+E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
+	movq	8(%rsi), %xmm1
+	jmp	L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_RDX)
+	movq	8(%rsi), %rdx
+L(l2):
+	movq	(%rsi), %rax
+	ret
+	.balign	8
+L(l3):
+	movq	(%rsi), %xmm0
 	ret
 
-	/* See the comment above .Lload_sse; the same logic applies here.  */
-	.align 2
-.LUW8:
-.Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-	jmp	.Lret_from_save_sse
-
-.LUW9:
-	.size	ffi_closure_unix64,.-ffi_closure_unix64
-
-#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
-	.section	.eh_frame,"a",@unwind
-#else
-	.section	.eh_frame,"a",@progbits
-#endif
-.Lframe1:
-	.long	.LECIE1-.LSCIE1		/* CIE Length */
-.LSCIE1:
-	.long	0			/* CIE Identifier Tag */
-	.byte	1			/* CIE Version */
-	.ascii "zR\0"			/* CIE Augmentation */
-	.uleb128 1			/* CIE Code Alignment Factor */
-	.sleb128 -8			/* CIE Data Alignment Factor */
-	.byte	0x10			/* CIE RA Column */
-	.uleb128 1			/* Augmentation size */
-	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
-	.uleb128 1
-	.align 8
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW0-.			/* FDE initial location */
+L(la):	call	PLT(C(abort))
+
+L(UW11):
+ENDF(C(ffi_closure_unix64))
+
+	.balign	2
+	.globl	C(ffi_go_closure_unix64_sse)
+	FFI_HIDDEN(C(ffi_go_closure_unix64_sse))
+
+C(ffi_go_closure_unix64_sse):
+L(UW12):
+	subq	$ffi_closure_FS, %rsp
+L(UW13):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	L(sse_entry2)
+
+L(UW14):
+ENDF(C(ffi_go_closure_unix64_sse))
+
+	.balign	2
+	.globl	C(ffi_go_closure_unix64)
+	FFI_HIDDEN(C(ffi_go_closure_unix64))
+
+C(ffi_go_closure_unix64):
+L(UW15):
+	subq	$ffi_closure_FS, %rsp
+L(UW16):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry2):
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+#ifdef __ILP32__
+	movl	4(%r10), %edi		/* Load cif */
+	movl	8(%r10), %esi		/* Load fun */
+	movl	%r10d, %edx		/* Load closure (user_data) */
 #else
-	.long	.LUW0@rel
+	movq	8(%r10), %rdi		/* Load cif */
+	movq	16(%r10), %rsi		/* Load fun */
+	movq	%r10, %rdx		/* Load closure (user_data) */
 #endif
-	.long	.LUW4-.LUW0		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
+	jmp	L(do_closure)
 
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW1-.LUW0
+L(UW17):
+ENDF(C(ffi_go_closure_unix64))
 
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
-	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
-	.uleb128 6
-	.uleb128 32
-	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
-	.uleb128 2
-	.byte	0xa			/* DW_CFA_remember_state */
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all.  */
 
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW2-.LUW1
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW3-.LUW2
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE1:
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW5-.			/* FDE initial location */
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,"a",@unwind
 #else
-	.long	.LUW5@rel
+.section .eh_frame,"a",@progbits
 #endif
-	.long	.LUW9-.LUW5		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
 
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 208
-	.byte	0xa			/* DW_CFA_remember_state */
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X)	X - .
+#else
+# define PCREL(X)	X@rel
+#endif
 
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW7-.LUW6
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 8
+/* Simplify advancing between labels.  Assume DW_CFA_advance_loc1 fits.  */
+#define ADV(N, P)	.byte 2, L(N)-L(P)
 
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW8-.LUW7
+	.balign 8
+L(CIE):
+	.set	L(set0),L(ECIE)-L(SCIE)
+	.long	L(set0)			/* CIE Length */
+L(SCIE):
+	.long	0			/* CIE Identifier Tag */
+	.byte	1			/* CIE Version */
+	.ascii	"zR\0"			/* CIE Augmentation */
+	.byte	1			/* CIE Code Alignment Factor */
+	.byte	0x78			/* CIE Data Alignment Factor */
+	.byte	0x10			/* CIE RA Column */
+	.byte	1			/* Augmentation size */
+	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
+	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp offset 8 */
+	.byte	0x80+16, 1		/* DW_CFA_offset, %rip offset 1*-8 */
+	.balign 8
+L(ECIE):
+
+	.set	L(set1),L(EFDE1)-L(SFDE1)
+	.long	L(set1)			/* FDE Length */
+L(SFDE1):
+	.long	L(SFDE1)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW0))		/* Initial location */
+	.long	L(UW4)-L(UW0)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW1, UW0)
+	.byte	0xc, 6, 32		/* DW_CFA_def_cfa, %rbp 32 */
+	.byte	0x80+6, 2		/* DW_CFA_offset, %rbp 2*-8 */
+	ADV(UW2, UW1)
+	.byte	0xa			/* DW_CFA_remember_state */
+	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp 8 */
+	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
+	ADV(UW3, UW2)
 	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE3:
+	.balign	8
+L(EFDE1):
+
+	.set	L(set2),L(EFDE2)-L(SFDE2)
+	.long	L(set2)			/* FDE Length */
+L(SFDE2):
+	.long	L(SFDE2)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW5))		/* Initial location */
+	.long	L(UW7)-L(UW5)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW6, UW5)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE2):
+
+	.set	L(set3),L(EFDE3)-L(SFDE3)
+	.long	L(set3)			/* FDE Length */
+L(SFDE3):
+	.long	L(SFDE3)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW8))		/* Initial location */
+	.long	L(UW11)-L(UW8)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW9, UW8)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	ADV(UW10, UW9)
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset 8 */
+L(EFDE3):
+
+	.set	L(set4),L(EFDE4)-L(SFDE4)
+	.long	L(set4)			/* FDE Length */
+L(SFDE4):
+	.long	L(SFDE4)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW12))		/* Initial location */
+	.long	L(UW14)-L(UW12)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW13, UW12)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE4):
+
+	.set	L(set5),L(EFDE5)-L(SFDE5)
+	.long	L(set5)			/* FDE Length */
+L(SFDE5):
+	.long	L(SFDE5)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW15))		/* Initial location */
+	.long	L(UW17)-L(UW15)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW16, UW15)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE5):
+#ifdef __APPLE__
+	.subsections_via_symbols
+#endif
 
 #endif /* __x86_64__ */
-
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
diff --git a/libffi/src/x86/win32.S b/libffi/src/x86/win32.S
deleted file mode 100644
index 24b7bbd..0000000
--- a/libffi/src/x86/win32.S
+++ /dev/null
@@ -1,1201 +0,0 @@
-/* -----------------------------------------------------------------------
-   win32.S - Copyright (c) 1996, 1998, 2001, 2002, 2009  Red Hat, Inc.
-	     Copyright (c) 2001  John Beniton
-	     Copyright (c) 2002  Ranjit Mathew
-	     Copyright (c) 2009  Daniel Witte
-			
- 
-   X86 Foreign Function Interface
- 
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   ``Software''), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
- 
-   The above copyright notice and this permission notice shall be included
-   in all copies or substantial portions of the Software.
- 
-   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-   DEALINGS IN THE SOFTWARE.
-   -----------------------------------------------------------------------
-   */
- 
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
-#ifdef _MSC_VER
-
-.386
-.MODEL FLAT, C
-
-EXTRN ffi_closure_SYSV_inner:NEAR
-
-_TEXT SEGMENT
-
-ffi_call_win32 PROC NEAR,
-    ffi_prep_args : NEAR PTR DWORD,
-    ecif          : NEAR PTR DWORD,
-    cif_abi       : DWORD,
-    cif_bytes     : DWORD,
-    cif_flags     : DWORD,
-    rvalue        : NEAR PTR DWORD,
-    fn            : NEAR PTR DWORD
-
-        ;; Make room for all of the new args.
-        mov  ecx, cif_bytes
-        sub  esp, ecx
-
-        mov  eax, esp
-
-        ;; Place all of the ffi_prep_args in position
-        push ecif
-        push eax
-        call ffi_prep_args
-
-        ;; Return stack to previous state and call the function
-        add  esp, 8
-
-	;; Handle thiscall and fastcall
-	cmp cif_abi, 3 ;; FFI_THISCALL
-	jz do_thiscall
-	cmp cif_abi, 4 ;; FFI_FASTCALL
-	jnz do_stdcall
-	mov ecx, DWORD PTR [esp]
-	mov edx, DWORD PTR [esp+4]
-	add esp, 8
-	jmp do_stdcall
-do_thiscall:
-	mov ecx, DWORD PTR [esp]
-	add esp, 4
-do_stdcall:
-        call fn
-
-        ;; cdecl:   we restore esp in the epilogue, so there's no need to
-        ;;          remove the space we pushed for the args.
-        ;; stdcall: the callee has already cleaned the stack.
-
-        ;; Load ecx with the return type code
-        mov  ecx, cif_flags
-
-        ;; If the return value pointer is NULL, assume no return value.
-        cmp  rvalue, 0
-        jne  ca_jumptable
-
-        ;; Even if there is no space for the return value, we are
-        ;; obliged to handle floating-point values.
-        cmp  ecx, FFI_TYPE_FLOAT
-        jne  ca_epilogue
-        fstp st(0)
-
-        jmp  ca_epilogue
-
-ca_jumptable:
-        jmp  [ca_jumpdata + 4 * ecx]
-ca_jumpdata:
-        ;; Do not insert anything here between label and jump table.
-        dd offset ca_epilogue       ;; FFI_TYPE_VOID
-        dd offset ca_retint         ;; FFI_TYPE_INT
-        dd offset ca_retfloat       ;; FFI_TYPE_FLOAT
-        dd offset ca_retdouble      ;; FFI_TYPE_DOUBLE
-        dd offset ca_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset ca_retuint8       ;; FFI_TYPE_UINT8
-        dd offset ca_retsint8       ;; FFI_TYPE_SINT8
-        dd offset ca_retuint16      ;; FFI_TYPE_UINT16
-        dd offset ca_retsint16      ;; FFI_TYPE_SINT16
-        dd offset ca_retint         ;; FFI_TYPE_UINT32
-        dd offset ca_retint         ;; FFI_TYPE_SINT32
-        dd offset ca_retint64       ;; FFI_TYPE_UINT64
-        dd offset ca_retint64       ;; FFI_TYPE_SINT64
-        dd offset ca_epilogue       ;; FFI_TYPE_STRUCT
-        dd offset ca_retint         ;; FFI_TYPE_POINTER
-        dd offset ca_retstruct1b    ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset ca_retstruct2b    ;; FFI_TYPE_SMALL_STRUCT_2B
-        dd offset ca_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
-        dd offset ca_epilogue       ;; FFI_TYPE_MS_STRUCT
-
-        /* Sign/zero extend as appropriate.  */
-ca_retuint8:
-        movzx eax, al
-        jmp   ca_retint
-
-ca_retsint8:
-        movsx eax, al
-        jmp   ca_retint
-
-ca_retuint16:
-        movzx eax, ax
-        jmp   ca_retint
-
-ca_retsint16:
-        movsx eax, ax
-        jmp   ca_retint
-
-ca_retint:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], eax
-        jmp   ca_epilogue
-
-ca_retint64:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], eax
-        mov   [ecx + 4], edx
-        jmp   ca_epilogue
-
-ca_retfloat:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        fstp  DWORD PTR [ecx]
-        jmp   ca_epilogue
-
-ca_retdouble:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        fstp  QWORD PTR [ecx]
-        jmp   ca_epilogue
-
-ca_retlongdouble:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        fstp  TBYTE PTR [ecx]
-        jmp   ca_epilogue
-
-ca_retstruct1b:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], al
-        jmp   ca_epilogue
-
-ca_retstruct2b:
-        ;; Load %ecx with the pointer to storage for the return value
-        mov   ecx, rvalue
-        mov   [ecx + 0], ax
-        jmp   ca_epilogue
-
-ca_epilogue:
-        ;; Epilogue code is autogenerated.
-        ret
-ffi_call_win32 ENDP
-
-ffi_closure_THISCALL PROC NEAR FORCEFRAME
-	sub	esp, 40
-	lea	edx, [ebp -24]
-	mov	[ebp - 12], edx	/* resp */
-	lea	edx, [ebp + 12]  /* account for stub return address on stack */
-	jmp	stub
-ffi_closure_THISCALL ENDP
-
-ffi_closure_SYSV PROC NEAR FORCEFRAME
-    ;; the ffi_closure ctx is passed in eax by the trampoline.
-
-        sub  esp, 40
-        lea  edx, [ebp - 24]
-        mov  [ebp - 12], edx         ;; resp
-        lea  edx, [ebp + 8]
-stub::
-        mov  [esp + 8], edx          ;; args
-        lea  edx, [ebp - 12]
-        mov  [esp + 4], edx          ;; &resp
-        mov  [esp], eax              ;; closure
-        call ffi_closure_SYSV_inner
-        mov  ecx, [ebp - 12]
-
-cs_jumptable:
-        jmp  [cs_jumpdata + 4 * eax]
-cs_jumpdata:
-        ;; Do not insert anything here between the label and jump table.
-        dd offset cs_epilogue       ;; FFI_TYPE_VOID
-        dd offset cs_retint         ;; FFI_TYPE_INT
-        dd offset cs_retfloat       ;; FFI_TYPE_FLOAT
-        dd offset cs_retdouble      ;; FFI_TYPE_DOUBLE
-        dd offset cs_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cs_retuint8       ;; FFI_TYPE_UINT8
-        dd offset cs_retsint8       ;; FFI_TYPE_SINT8
-        dd offset cs_retuint16      ;; FFI_TYPE_UINT16
-        dd offset cs_retsint16      ;; FFI_TYPE_SINT16
-        dd offset cs_retint         ;; FFI_TYPE_UINT32
-        dd offset cs_retint         ;; FFI_TYPE_SINT32
-        dd offset cs_retint64       ;; FFI_TYPE_UINT64
-        dd offset cs_retint64       ;; FFI_TYPE_SINT64
-        dd offset cs_retstruct      ;; FFI_TYPE_STRUCT
-        dd offset cs_retint         ;; FFI_TYPE_POINTER
-        dd offset cs_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cs_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
-        dd offset cs_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
-        dd offset cs_retmsstruct    ;; FFI_TYPE_MS_STRUCT
-
-cs_retuint8:
-        movzx eax, BYTE PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retsint8:
-        movsx eax, BYTE PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retuint16:
-        movzx eax, WORD PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retsint16:
-        movsx eax, WORD PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retint:
-        mov   eax, [ecx]
-        jmp   cs_epilogue
-
-cs_retint64:
-        mov   eax, [ecx + 0]
-        mov   edx, [ecx + 4]
-        jmp   cs_epilogue
-
-cs_retfloat:
-        fld   DWORD PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retdouble:
-        fld   QWORD PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retlongdouble:
-        fld   TBYTE PTR [ecx]
-        jmp   cs_epilogue
-
-cs_retstruct:
-        ;; Caller expects us to pop struct return value pointer hidden arg.
-        ;; Epilogue code is autogenerated.
-        ret	4
-
-cs_retmsstruct:
-        ;; Caller expects us to return a pointer to the real return value.
-        mov   eax, ecx
-        ;; Caller doesn't expects us to pop struct return value pointer hidden arg.
-        jmp   cs_epilogue
-
-cs_epilogue:
-        ;; Epilogue code is autogenerated.
-        ret
-ffi_closure_SYSV ENDP
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) AND NOT 3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-
-ffi_closure_raw_THISCALL PROC NEAR USES esi FORCEFRAME
-	sub esp, 36
-	mov  esi, [eax + RAW_CLOSURE_CIF_OFFSET]        ;; closure->cif
-	mov  edx, [eax + RAW_CLOSURE_USER_DATA_OFFSET]  ;; closure->user_data
-	mov [esp + 12], edx
-	lea edx, [ebp + 12]
-	jmp stubraw
-ffi_closure_raw_THISCALL ENDP
-
-ffi_closure_raw_SYSV PROC NEAR USES esi FORCEFRAME
-    ;; the ffi_closure ctx is passed in eax by the trampoline.
-
-        sub  esp, 40
-        mov  esi, [eax + RAW_CLOSURE_CIF_OFFSET]        ;; closure->cif
-        mov  edx, [eax + RAW_CLOSURE_USER_DATA_OFFSET]  ;; closure->user_data
-        mov  [esp + 12], edx                            ;; user_data
-        lea  edx, [ebp + 8]
-stubraw::
-        mov  [esp + 8], edx                             ;; raw_args
-        lea  edx, [ebp - 24]
-        mov  [esp + 4], edx                             ;; &res
-        mov  [esp], esi                                 ;; cif
-        call DWORD PTR [eax + RAW_CLOSURE_FUN_OFFSET]   ;; closure->fun
-        mov  eax, [esi + CIF_FLAGS_OFFSET]              ;; cif->flags
-        lea  ecx, [ebp - 24]
-
-cr_jumptable:
-        jmp  [cr_jumpdata + 4 * eax]
-cr_jumpdata:
-        ;; Do not insert anything here between the label and jump table.
-        dd offset cr_epilogue       ;; FFI_TYPE_VOID
-        dd offset cr_retint         ;; FFI_TYPE_INT
-        dd offset cr_retfloat       ;; FFI_TYPE_FLOAT
-        dd offset cr_retdouble      ;; FFI_TYPE_DOUBLE
-        dd offset cr_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cr_retuint8       ;; FFI_TYPE_UINT8
-        dd offset cr_retsint8       ;; FFI_TYPE_SINT8
-        dd offset cr_retuint16      ;; FFI_TYPE_UINT16
-        dd offset cr_retsint16      ;; FFI_TYPE_SINT16
-        dd offset cr_retint         ;; FFI_TYPE_UINT32
-        dd offset cr_retint         ;; FFI_TYPE_SINT32
-        dd offset cr_retint64       ;; FFI_TYPE_UINT64
-        dd offset cr_retint64       ;; FFI_TYPE_SINT64
-        dd offset cr_epilogue       ;; FFI_TYPE_STRUCT
-        dd offset cr_retint         ;; FFI_TYPE_POINTER
-        dd offset cr_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cr_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
-        dd offset cr_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
-        dd offset cr_epilogue       ;; FFI_TYPE_MS_STRUCT
-
-cr_retuint8:
-        movzx eax, BYTE PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retsint8:
-        movsx eax, BYTE PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retuint16:
-        movzx eax, WORD PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retsint16:
-        movsx eax, WORD PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retint:
-        mov   eax, [ecx]
-        jmp   cr_epilogue
-
-cr_retint64:
-        mov   eax, [ecx + 0]
-        mov   edx, [ecx + 4]
-        jmp   cr_epilogue
-
-cr_retfloat:
-        fld   DWORD PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retdouble:
-        fld   QWORD PTR [ecx]
-        jmp   cr_epilogue
-
-cr_retlongdouble:
-        fld   TBYTE PTR [ecx]
-        jmp   cr_epilogue
-
-cr_epilogue:
-        ;; Epilogue code is autogenerated.
-        ret
-ffi_closure_raw_SYSV ENDP
-
-#endif /* !FFI_NO_RAW_API */
-
-ffi_closure_STDCALL PROC NEAR FORCEFRAME
-    ;; the ffi_closure ctx is passed in eax by the trampoline.
-
-        sub  esp, 40
-        lea  edx, [ebp - 24]
-        mov  [ebp - 12], edx         ;; resp
-        lea  edx, [ebp + 12]         ;; account for stub return address on stack
-        mov  [esp + 8], edx          ;; args
-        lea  edx, [ebp - 12]
-        mov  [esp + 4], edx          ;; &resp
-        mov  [esp], eax              ;; closure
-        call ffi_closure_SYSV_inner
-        mov  ecx, [ebp - 12]
-
-cd_jumptable:
-        jmp  [cd_jumpdata + 4 * eax]
-cd_jumpdata:
-        ;; Do not insert anything here between the label and jump table.
-        dd offset cd_epilogue       ;; FFI_TYPE_VOID
-        dd offset cd_retint         ;; FFI_TYPE_INT
-        dd offset cd_retfloat       ;; FFI_TYPE_FLOAT
-        dd offset cd_retdouble      ;; FFI_TYPE_DOUBLE
-        dd offset cd_retlongdouble  ;; FFI_TYPE_LONGDOUBLE
-        dd offset cd_retuint8       ;; FFI_TYPE_UINT8
-        dd offset cd_retsint8       ;; FFI_TYPE_SINT8
-        dd offset cd_retuint16      ;; FFI_TYPE_UINT16
-        dd offset cd_retsint16      ;; FFI_TYPE_SINT16
-        dd offset cd_retint         ;; FFI_TYPE_UINT32
-        dd offset cd_retint         ;; FFI_TYPE_SINT32
-        dd offset cd_retint64       ;; FFI_TYPE_UINT64
-        dd offset cd_retint64       ;; FFI_TYPE_SINT64
-        dd offset cd_epilogue       ;; FFI_TYPE_STRUCT
-        dd offset cd_retint         ;; FFI_TYPE_POINTER
-        dd offset cd_retsint8       ;; FFI_TYPE_SMALL_STRUCT_1B
-        dd offset cd_retsint16      ;; FFI_TYPE_SMALL_STRUCT_2B
-        dd offset cd_retint         ;; FFI_TYPE_SMALL_STRUCT_4B
-
-cd_retuint8:
-        movzx eax, BYTE PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retsint8:
-        movsx eax, BYTE PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retuint16:
-        movzx eax, WORD PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retsint16:
-        movsx eax, WORD PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retint:
-        mov   eax, [ecx]
-        jmp   cd_epilogue
-
-cd_retint64:
-        mov   eax, [ecx + 0]
-        mov   edx, [ecx + 4]
-        jmp   cd_epilogue
-
-cd_retfloat:
-        fld   DWORD PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retdouble:
-        fld   QWORD PTR [ecx]
-        jmp   cd_epilogue
-
-cd_retlongdouble:
-        fld   TBYTE PTR [ecx]
-        jmp   cd_epilogue
-
-cd_epilogue:
-        ;; Epilogue code is autogenerated.
-        ret
-ffi_closure_STDCALL ENDP
-
-_TEXT ENDS
-END
-
-#else
-
-	.text
- 
-        # This assumes we are using gas.
-        .balign 16
-	.globl	_ffi_call_win32
-#ifndef __OS2__
-	.def	_ffi_call_win32;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_call_win32:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-        # Make room for all of the new args.
-        movl  20(%ebp),%ecx                                                     
-        subl  %ecx,%esp
- 
-        movl  %esp,%eax
- 
-        # Place all of the ffi_prep_args in position
-        pushl 12(%ebp)
-        pushl %eax
-        call  *8(%ebp)
- 
-        # Return stack to previous state and call the function
-        addl  $8,%esp
-
-	# Handle fastcall and thiscall
-	cmpl $3, 16(%ebp)  # FFI_THISCALL
-	jz .do_thiscall
-	cmpl $4, 16(%ebp) # FFI_FASTCALL
-	jnz .do_fncall
-	movl (%esp), %ecx
-	movl 4(%esp), %edx
-	addl $8, %esp
-	jmp .do_fncall
-.do_thiscall:
-	movl (%esp), %ecx
-	addl $4, %esp
-
-.do_fncall:
-	 
-        # FIXME: Align the stack to a 128-bit boundary to avoid
-        # potential performance hits.
-
-        call  *32(%ebp)
- 
-        # stdcall functions pop arguments off the stack themselves
-
-        # Load %ecx with the return type code
-        movl  24(%ebp),%ecx
- 
-        # If the return value pointer is NULL, assume no return value.
-        cmpl  $0,28(%ebp)
-        jne   0f
- 
-        # Even if there is no space for the return value, we are
-        # obliged to handle floating-point values.
-        cmpl  $FFI_TYPE_FLOAT,%ecx
-        jne   .Lnoretval
-        fstp  %st(0)
- 
-        jmp   .Lepilogue
-
-0:
-	call	1f
-	# Do not insert anything here between the call and the jump table.
-.Lstore_table:
-	.long	.Lnoretval		/* FFI_TYPE_VOID */
-	.long	.Lretint		/* FFI_TYPE_INT */
-	.long	.Lretfloat		/* FFI_TYPE_FLOAT */
-	.long	.Lretdouble		/* FFI_TYPE_DOUBLE */
-	.long	.Lretlongdouble		/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lretuint8		/* FFI_TYPE_UINT8 */
-	.long	.Lretsint8		/* FFI_TYPE_SINT8 */
-	.long	.Lretuint16		/* FFI_TYPE_UINT16 */
-	.long	.Lretsint16		/* FFI_TYPE_SINT16 */
-	.long	.Lretint		/* FFI_TYPE_UINT32 */
-	.long	.Lretint		/* FFI_TYPE_SINT32 */
-	.long	.Lretint64		/* FFI_TYPE_UINT64 */
-	.long	.Lretint64		/* FFI_TYPE_SINT64 */
-	.long	.Lretstruct		/* FFI_TYPE_STRUCT */
-	.long	.Lretint		/* FFI_TYPE_POINTER */
-	.long	.Lretstruct1b		/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long	.Lretstruct2b		/* FFI_TYPE_SMALL_STRUCT_2B */
-	.long	.Lretstruct4b		/* FFI_TYPE_SMALL_STRUCT_4B */
-	.long	.Lretstruct		/* FFI_TYPE_MS_STRUCT */
-1:
-	add	%ecx, %ecx
-	add	%ecx, %ecx
-	add	(%esp),%ecx
-	add	$4, %esp
-	jmp	*(%ecx)
-
-	/* Sign/zero extend as appropriate.  */
-.Lretsint8:
-	movsbl	%al, %eax
-	jmp	.Lretint
-
-.Lretsint16:
-	movswl	%ax, %eax
-	jmp	.Lretint
-
-.Lretuint8:
-	movzbl	%al, %eax
-	jmp	.Lretint
-
-.Lretuint16:
-	movzwl	%ax, %eax
-	jmp	.Lretint
-
-.Lretint:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        movl  %eax,0(%ecx)
-        jmp   .Lepilogue
- 
-.Lretfloat:
-         # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        fstps (%ecx)
-        jmp   .Lepilogue
- 
-.Lretdouble:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        fstpl (%ecx)
-        jmp   .Lepilogue
- 
-.Lretlongdouble:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        fstpt (%ecx)
-        jmp   .Lepilogue
- 
-.Lretint64:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        movl  %eax,0(%ecx)
-        movl  %edx,4(%ecx)
-	jmp   .Lepilogue
-
-.Lretstruct1b:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        movb  %al,0(%ecx)
-        jmp   .Lepilogue
- 
-.Lretstruct2b:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        movw  %ax,0(%ecx)
-        jmp   .Lepilogue
-
-.Lretstruct4b:
-        # Load %ecx with the pointer to storage for the return value
-        movl  28(%ebp),%ecx
-        movl  %eax,0(%ecx)
-        jmp   .Lepilogue
-
-.Lretstruct:
-        # Nothing to do!
- 
-.Lnoretval:
-.Lepilogue:
-        movl %ebp,%esp
-        popl %ebp
-        ret
-.ffi_call_win32_end:
-        .balign 16
-	.globl	_ffi_closure_THISCALL
-#ifndef __OS2__
-	.def	_ffi_closure_THISCALL;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_closure_THISCALL:
-	pushl	%ebp
-	movl	%esp, %ebp
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	12(%ebp), %edx  /* account for stub return address on stack */
-	jmp	.stub
-.LFE1:
-
-        # This assumes we are using gas.
-        .balign 16
-	.globl	_ffi_closure_SYSV
-#ifndef __OS2__
-	.def	_ffi_closure_SYSV;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_closure_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-.stub:
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-	call	_ffi_closure_SYSV_inner
-	movl	-12(%ebp), %ecx
-
-0:
-	call	1f
-	# Do not insert anything here between the call and the jump table.
-.Lcls_store_table:
-	.long	.Lcls_noretval		/* FFI_TYPE_VOID */
-	.long	.Lcls_retint		/* FFI_TYPE_INT */
-	.long	.Lcls_retfloat		/* FFI_TYPE_FLOAT */
-	.long	.Lcls_retdouble		/* FFI_TYPE_DOUBLE */
-	.long	.Lcls_retldouble	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lcls_retuint8		/* FFI_TYPE_UINT8 */
-	.long	.Lcls_retsint8		/* FFI_TYPE_SINT8 */
-	.long	.Lcls_retuint16		/* FFI_TYPE_UINT16 */
-	.long	.Lcls_retsint16		/* FFI_TYPE_SINT16 */
-	.long	.Lcls_retint		/* FFI_TYPE_UINT32 */
-	.long	.Lcls_retint		/* FFI_TYPE_SINT32 */
-	.long	.Lcls_retllong		/* FFI_TYPE_UINT64 */
-	.long	.Lcls_retllong		/* FFI_TYPE_SINT64 */
-	.long	.Lcls_retstruct		/* FFI_TYPE_STRUCT */
-	.long	.Lcls_retint		/* FFI_TYPE_POINTER */
-	.long	.Lcls_retstruct1	/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long	.Lcls_retstruct2	/* FFI_TYPE_SMALL_STRUCT_2B */
-	.long	.Lcls_retstruct4	/* FFI_TYPE_SMALL_STRUCT_4B */
-	.long	.Lcls_retmsstruct	/* FFI_TYPE_MS_STRUCT */
-
-1:
-	add	%eax, %eax
-	add	%eax, %eax
-	add	(%esp),%eax
-	add	$4, %esp
-	jmp	*(%eax)
-
-	/* Sign/zero extend as appropriate.  */
-.Lcls_retsint8:
-	movsbl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retsint16:
-	movswl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retuint8:
-	movzbl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retuint16:
-	movzwl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-
-.Lcls_retstruct1:
-	movsbl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retstruct2:
-	movswl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retstruct4:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-
-.Lcls_retstruct:
-        # Caller expects us to pop struct return value pointer hidden arg.
-	movl	%ebp, %esp
-	popl	%ebp
-	ret	$0x4
-
-.Lcls_retmsstruct:
-	# Caller expects us to return a pointer to the real return value.
-	mov	%ecx, %eax
-	# Caller doesn't expects us to pop struct return value pointer hidden arg.
-	jmp	.Lcls_epilogue
-
-.Lcls_noretval:
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret
-.ffi_closure_SYSV_end:
-.LFE3:
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-        .balign 16
-	.globl	_ffi_closure_raw_THISCALL
-#ifndef __OS2__
-	.def	_ffi_closure_raw_THISCALL;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_closure_raw_THISCALL:
-	pushl	%ebp
-	movl	%esp, %ebp
-	pushl	%esi
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	12(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	jmp	.stubraw
-        # This assumes we are using gas.
-        .balign 16
-	.globl	_ffi_closure_raw_SYSV
-#ifndef __OS2__
-	.def	_ffi_closure_raw_SYSV;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_closure_raw_SYSV:
-.LFB4:
-	pushl	%ebp
-.LCFI6:
-	movl	%esp, %ebp
-.LCFI7:
-	pushl	%esi
-.LCFI8:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-.stubraw:
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-0:
-	call	1f
-	# Do not insert anything here between the call and the jump table.
-.Lrcls_store_table:
-	.long	.Lrcls_noretval		/* FFI_TYPE_VOID */
-	.long	.Lrcls_retint		/* FFI_TYPE_INT */
-	.long	.Lrcls_retfloat		/* FFI_TYPE_FLOAT */
-	.long	.Lrcls_retdouble	/* FFI_TYPE_DOUBLE */
-	.long	.Lrcls_retldouble	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lrcls_retuint8		/* FFI_TYPE_UINT8 */
-	.long	.Lrcls_retsint8		/* FFI_TYPE_SINT8 */
-	.long	.Lrcls_retuint16	/* FFI_TYPE_UINT16 */
-	.long	.Lrcls_retsint16	/* FFI_TYPE_SINT16 */
-	.long	.Lrcls_retint		/* FFI_TYPE_UINT32 */
-	.long	.Lrcls_retint		/* FFI_TYPE_SINT32 */
-	.long	.Lrcls_retllong		/* FFI_TYPE_UINT64 */
-	.long	.Lrcls_retllong		/* FFI_TYPE_SINT64 */
-	.long	.Lrcls_retstruct	/* FFI_TYPE_STRUCT */
-	.long	.Lrcls_retint		/* FFI_TYPE_POINTER */
-	.long	.Lrcls_retstruct1	/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long	.Lrcls_retstruct2	/* FFI_TYPE_SMALL_STRUCT_2B */
-	.long	.Lrcls_retstruct4	/* FFI_TYPE_SMALL_STRUCT_4B */
-	.long	.Lrcls_retstruct	/* FFI_TYPE_MS_STRUCT */
-1:
-	add	%eax, %eax
-	add	%eax, %eax
-	add	(%esp),%eax
-	add	$4, %esp
-	jmp	*(%eax)
-
-	/* Sign/zero extend as appropriate.  */
-.Lrcls_retsint8:
-	movsbl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retsint16:
-	movswl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retuint8:
-	movzbl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retuint16:
-	movzwl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retstruct1:
-	movsbl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retstruct2:
-	movswl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retstruct4:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-
-.Lrcls_retstruct:
-	# Nothing to do!
-
-.Lrcls_noretval:
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
-	popl	%ebp
-	ret
-.ffi_closure_raw_SYSV_end:
-.LFE4:
-
-#endif /* !FFI_NO_RAW_API */
-
-        # This assumes we are using gas.
-	.balign	16
-	.globl	_ffi_closure_STDCALL
-#ifndef __OS2__
-	.def	_ffi_closure_STDCALL;	.scl	2;	.type	32;	.endef
-#endif
-_ffi_closure_STDCALL:
-.LFB5:
-	pushl	%ebp
-.LCFI9:
-	movl	%esp, %ebp
-.LCFI10:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	12(%ebp), %edx  /* account for stub return address on stack */
-	movl	%edx, 4(%esp)	/* args */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-	call	_ffi_closure_SYSV_inner
-	movl	-12(%ebp), %ecx
-0:
-	call	1f
-	# Do not insert anything here between the call and the jump table.
-.Lscls_store_table:
-	.long	.Lscls_noretval		/* FFI_TYPE_VOID */
-	.long	.Lscls_retint		/* FFI_TYPE_INT */
-	.long	.Lscls_retfloat		/* FFI_TYPE_FLOAT */
-	.long	.Lscls_retdouble	/* FFI_TYPE_DOUBLE */
-	.long	.Lscls_retldouble	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lscls_retuint8		/* FFI_TYPE_UINT8 */
-	.long	.Lscls_retsint8		/* FFI_TYPE_SINT8 */
-	.long	.Lscls_retuint16	/* FFI_TYPE_UINT16 */
-	.long	.Lscls_retsint16	/* FFI_TYPE_SINT16 */
-	.long	.Lscls_retint		/* FFI_TYPE_UINT32 */
-	.long	.Lscls_retint		/* FFI_TYPE_SINT32 */
-	.long	.Lscls_retllong		/* FFI_TYPE_UINT64 */
-	.long	.Lscls_retllong		/* FFI_TYPE_SINT64 */
-	.long	.Lscls_retstruct	/* FFI_TYPE_STRUCT */
-	.long	.Lscls_retint		/* FFI_TYPE_POINTER */
-	.long	.Lscls_retstruct1	/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long	.Lscls_retstruct2	/* FFI_TYPE_SMALL_STRUCT_2B */
-	.long	.Lscls_retstruct4	/* FFI_TYPE_SMALL_STRUCT_4B */
-1:
-	add	%eax, %eax
-	add	%eax, %eax
-	add	(%esp),%eax
-	add	$4, %esp
-	jmp	*(%eax)
-
-	/* Sign/zero extend as appropriate.  */
-.Lscls_retsint8:
-	movsbl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retsint16:
-	movswl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retuint8:
-	movzbl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retuint16:
-	movzwl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retfloat:
-	flds	(%ecx)
-	jmp	.Lscls_epilogue
-
-.Lscls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lscls_epilogue
-
-.Lscls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lscls_epilogue
-
-.Lscls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lscls_epilogue
-
-.Lscls_retstruct1:
-	movsbl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retstruct2:
-	movswl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retstruct4:
-	movl	(%ecx), %eax
-	jmp	.Lscls_epilogue
-
-.Lscls_retstruct:
-	# Nothing to do!
-
-.Lscls_noretval:
-.Lscls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret
-.ffi_closure_STDCALL_end:
-.LFE5:
-
-#ifndef __OS2__
-	.section	.eh_frame,"w"
-#endif
-.Lframe1:
-.LSCIE1:
-	.long	.LECIE1-.LASCIE1  /* Length of Common Information Entry */
-.LASCIE1:
-	.long	0x0	/* CIE Identifier Tag */
-	.byte	0x1	/* CIE Version */
-#ifdef __PIC__
-	.ascii "zR\0"	/* CIE Augmentation */
-#else
-	.ascii "\0"	/* CIE Augmentation */
-#endif
-	.byte	0x1	/* .uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x7c	/* .sleb128 -4; CIE Data Alignment Factor */
-	.byte	0x8	/* CIE RA Column */
-#ifdef __PIC__
-	.byte	0x1	/* .uleb128 0x1; Augmentation size */
-	.byte	0x1b	/* FDE Encoding (pcrel sdata4) */
-#endif
-	.byte	0xc	/* DW_CFA_def_cfa CFA = r4 + 4 = 4(%esp) */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x88	/* DW_CFA_offset, column 0x8 %eip at CFA + 1 * -4 */
-	.byte	0x1	/* .uleb128 0x1 */
-	.align 4
-.LECIE1:
-
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#if defined __PIC__ && defined HAVE_AS_X86_PCREL
-	.long	.LFB1-.	/* FDE initial location */
-#else
-	.long	.LFB1
-#endif
-	.long	.LFE1-.LFB1	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	/* DW_CFA_xxx CFI instructions go here.  */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI0-.LFB1
-	.byte	0xe	/* DW_CFA_def_cfa_offset CFA = r4 + 8 = 8(%esp) */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 %ebp at CFA + 2 * -4 */
-	.byte	0x2	/* .uleb128 0x2 */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI1-.LCFI0
-	.byte	0xd	/* DW_CFA_def_cfa_register CFA = r5 = %ebp */
-	.byte	0x5	/* .uleb128 0x5 */
-
-	/* End of DW_CFA_xxx CFI instructions.  */
-	.align 4
-.LEFDE1:
-
-
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#if defined __PIC__ && defined HAVE_AS_X86_PCREL
-	.long	.LFB3-.	/* FDE initial location */
-#else
-	.long	.LFB3
-#endif
-	.long	.LFE3-.LFB3	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	/* DW_CFA_xxx CFI instructions go here.  */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI4-.LFB3
-	.byte	0xe	/* DW_CFA_def_cfa_offset CFA = r4 + 8 = 8(%esp) */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 %ebp at CFA + 2 * -4 */
-	.byte	0x2	/* .uleb128 0x2 */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI5-.LCFI4
-	.byte	0xd	/* DW_CFA_def_cfa_register CFA = r5 = %ebp */
-	.byte	0x5	/* .uleb128 0x5 */
-
-	/* End of DW_CFA_xxx CFI instructions.  */
-	.align 4
-.LEFDE3:
-
-#if !FFI_NO_RAW_API
-
-.LSFDE4:
-	.long	.LEFDE4-.LASFDE4	/* FDE Length */
-.LASFDE4:
-	.long	.LASFDE4-.Lframe1	/* FDE CIE offset */
-#if defined __PIC__ && defined HAVE_AS_X86_PCREL
-	.long	.LFB4-.	/* FDE initial location */
-#else
-	.long	.LFB4
-#endif
-	.long	.LFE4-.LFB4	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	/* DW_CFA_xxx CFI instructions go here.  */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI6-.LFB4
-	.byte	0xe	/* DW_CFA_def_cfa_offset CFA = r4 + 8 = 8(%esp) */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 %ebp at CFA + 2 * -4 */
-	.byte	0x2	/* .uleb128 0x2 */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI7-.LCFI6
-	.byte	0xd	/* DW_CFA_def_cfa_register CFA = r5 = %ebp */
-	.byte	0x5	/* .uleb128 0x5 */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI8-.LCFI7
-	.byte	0x86	/* DW_CFA_offset, column 0x6 %esi at CFA + 3 * -4 */
-	.byte	0x3	/* .uleb128 0x3 */
-
-	/* End of DW_CFA_xxx CFI instructions.  */
-	.align 4
-.LEFDE4:
-
-#endif /* !FFI_NO_RAW_API */
-
-.LSFDE5:
-	.long	.LEFDE5-.LASFDE5	/* FDE Length */
-.LASFDE5:
-	.long	.LASFDE5-.Lframe1	/* FDE CIE offset */
-#if defined __PIC__ && defined HAVE_AS_X86_PCREL
-	.long	.LFB5-.	/* FDE initial location */
-#else
-	.long	.LFB5
-#endif
-	.long	.LFE5-.LFB5	/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	/* DW_CFA_xxx CFI instructions go here.  */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI9-.LFB5
-	.byte	0xe	/* DW_CFA_def_cfa_offset CFA = r4 + 8 = 8(%esp) */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 %ebp at CFA + 2 * -4 */
-	.byte	0x2	/* .uleb128 0x2 */
-
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI10-.LCFI9
-	.byte	0xd	/* DW_CFA_def_cfa_register CFA = r5 = %ebp */
-	.byte	0x5	/* .uleb128 0x5 */
-
-	/* End of DW_CFA_xxx CFI instructions.  */
-	.align 4
-.LEFDE5:
-
-#endif /* !_MSC_VER */
-
diff --git a/libffi/src/x86/win64.S b/libffi/src/x86/win64.S
index 687f97c..a5a20b6 100644
--- a/libffi/src/x86/win64.S
+++ b/libffi/src/x86/win64.S
@@ -1,264 +1,16 @@
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 
-/* Constants for ffi_call_win64 */
-#define STACK 0
-#define PREP_ARGS_FN 32
-#define ECIF 40
-#define CIF_BYTES 48
-#define CIF_FLAGS 56
-#define RVALUE 64
-#define FN 72
-
-/* ffi_call_win64 (void (*prep_args_fn)(char *, extended_cif *),
-		   extended_cif *ecif, unsigned bytes, unsigned flags,
-		   unsigned *rvalue, void (*fn)());
- */
-
-#ifdef _MSC_VER
-PUBLIC	ffi_call_win64
-
-EXTRN	__chkstk:NEAR
-EXTRN	ffi_closure_win64_inner:NEAR
-
-_TEXT	SEGMENT
-
-;;; ffi_closure_win64 will be called with these registers set:
-;;;    rax points to 'closure'
-;;;    r11 contains a bit mask that specifies which of the
-;;;    first four parameters are float or double
-;;;
-;;; It must move the parameters passed in registers to their stack location,
-;;; call ffi_closure_win64_inner for the actual work, then return the result.
-;;;
-ffi_closure_win64 PROC FRAME
-	;; copy register arguments onto stack
-	test	r11, 1
-	jne	first_is_float
-	mov	QWORD PTR [rsp+8], rcx
-	jmp	second
-first_is_float:
-	movlpd	QWORD PTR [rsp+8], xmm0
-
-second:
-	test	r11, 2
-	jne	second_is_float
-	mov	QWORD PTR [rsp+16], rdx
-	jmp	third
-second_is_float:
-	movlpd	QWORD PTR [rsp+16], xmm1
-
-third:
-	test	r11, 4
-	jne	third_is_float
-	mov	QWORD PTR [rsp+24], r8
-	jmp	fourth
-third_is_float:
-	movlpd	QWORD PTR [rsp+24], xmm2
-
-fourth:
-	test	r11, 8
-	jne	fourth_is_float
-	mov	QWORD PTR [rsp+32], r9
-	jmp	done
-fourth_is_float:
-	movlpd	QWORD PTR [rsp+32], xmm3
-
-done:
-	.ALLOCSTACK 40
-	sub	rsp, 40
-	.ENDPROLOG
-	mov	rcx, rax	; context is first parameter
-	mov	rdx, rsp	; stack is second parameter
-	add	rdx, 48		; point to start of arguments
-	mov	rax, ffi_closure_win64_inner
-	call	rax		; call the real closure function
-	add	rsp, 40
-	movd	xmm0, rax	; If the closure returned a float,
-				; ffi_closure_win64_inner wrote it to rax
-	ret	0
-ffi_closure_win64 ENDP
-
-ffi_call_win64 PROC FRAME
-	;; copy registers onto stack
-	mov	QWORD PTR [rsp+32], r9
-	mov	QWORD PTR [rsp+24], r8
-	mov	QWORD PTR [rsp+16], rdx
-	mov	QWORD PTR [rsp+8], rcx
-	.PUSHREG rbp
-	push	rbp
-	.ALLOCSTACK 48
-	sub	rsp, 48					; 00000030H
-	.SETFRAME rbp, 32
-	lea	rbp, QWORD PTR [rsp+32]
-	.ENDPROLOG
-
-	mov	eax, DWORD PTR CIF_BYTES[rbp]
-	add	rax, 15
-	and	rax, -16
-	call	__chkstk
-	sub	rsp, rax
-	lea	rax, QWORD PTR [rsp+32]
-	mov	QWORD PTR STACK[rbp], rax
-
-	mov	rdx, QWORD PTR ECIF[rbp]
-	mov	rcx, QWORD PTR STACK[rbp]
-	call	QWORD PTR PREP_ARGS_FN[rbp]
-
-	mov	rsp, QWORD PTR STACK[rbp]
-
-	movlpd	xmm3, QWORD PTR [rsp+24]
-	movd	r9, xmm3
-
-	movlpd	xmm2, QWORD PTR [rsp+16]
-	movd	r8, xmm2
-
-	movlpd	xmm1, QWORD PTR [rsp+8]
-	movd	rdx, xmm1
-
-	movlpd	xmm0, QWORD PTR [rsp]
-	movd	rcx, xmm0
-
-	call	QWORD PTR FN[rbp]
-ret_struct4b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_4B
- 	jne	ret_struct2b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	DWORD PTR [rcx], eax
-	jmp	ret_void$
-
-ret_struct2b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_2B
- 	jne	ret_struct1b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	WORD PTR [rcx], ax
-	jmp	ret_void$
-
-ret_struct1b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_1B
- 	jne	ret_uint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	BYTE PTR [rcx], al
-	jmp	ret_void$
-
-ret_uint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT8
- 	jne	ret_sint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_sint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT8
- 	jne	ret_uint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_uint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT16
- 	jne	ret_sint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT16
- 	jne	ret_uint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_uint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT32
- 	jne	ret_sint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov     eax, eax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT32
- 	jne	ret_float$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_float$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_FLOAT
- 	jne	SHORT ret_double$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movss	DWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_double$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_DOUBLE
- 	jne	SHORT ret_uint64$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movlpd	QWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_uint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT64
-  	jne	SHORT ret_sint64$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_sint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT64
-  	jne	SHORT ret_pointer$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_pointer$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_POINTER
-  	jne	SHORT ret_int$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_int$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_INT
-  	jne	SHORT ret_void$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_void$:
-	xor	rax, rax
-
-	lea	rsp, QWORD PTR [rbp+16]
-	pop	rbp
-	ret	0
-ffi_call_win64 ENDP
-_TEXT	ENDS
-END
+#if defined(HAVE_AS_CFI_PSEUDO_OP)
+        .cfi_sections   .debug_frame
+#endif
 
-#else
+#define arg0	%rcx
+#define arg1	%rdx
+#define arg2	%r8
+#define arg3	%r9
 
 #ifdef SYMBOL_UNDERSCORE
 #define SYMBOL_NAME(name) _##name
@@ -266,255 +18,202 @@ END
 #define SYMBOL_NAME(name) name
 #endif
 
-.text
-
-.extern SYMBOL_NAME(ffi_closure_win64_inner)
-
-# ffi_closure_win64 will be called with these registers set:
-#    rax points to 'closure'
-#    r11 contains a bit mask that specifies which of the
-#    first four parameters are float or double
-#
-# It must move the parameters passed in registers to their stack location,
-# call ffi_closure_win64_inner for the actual work, then return the result.
-#
-	.balign 16
-	.globl SYMBOL_NAME(ffi_closure_win64)
-	.seh_proc SYMBOL_NAME(ffi_closure_win64)
-SYMBOL_NAME(ffi_closure_win64):
-	# copy register arguments onto stack
-	test	$1,%r11
-	jne	.Lfirst_is_float
-	mov	%rcx, 8(%rsp)
-	jmp	.Lsecond
-.Lfirst_is_float:
-	movlpd	%xmm0, 8(%rsp)
-
-.Lsecond:
-	test	$2, %r11
-	jne	.Lsecond_is_float
-	mov	%rdx, 16(%rsp)
-	jmp	.Lthird
-.Lsecond_is_float:
-	movlpd	%xmm1, 16(%rsp)
-
-.Lthird:
-	test	$4, %r11
-	jne	.Lthird_is_float
-	mov	%r8,24(%rsp)
-	jmp	.Lfourth
-.Lthird_is_float:
-	movlpd	%xmm2, 24(%rsp)
-
-.Lfourth:
-	test	$8, %r11
-	jne	.Lfourth_is_float
-	mov	%r9, 32(%rsp)
-	jmp	.Ldone
-.Lfourth_is_float:
-	movlpd	%xmm3, 32(%rsp)
-
-.Ldone:
-	.seh_stackalloc 40
-	sub	$40, %rsp
+.macro E which
+	.align	8
+	.org	0b + \which * 8
+.endm
+
+	.text
+
+/* ffi_call_win64 (void *stack, struct win64_call_frame *frame, void *r10)
+
+   Bit o trickiness here -- FRAME is the base of the stack frame
+   for this function.  This has been allocated by ffi_call.  We also
+   deallocate some of the stack that has been alloca'd.  */
+
+	.align	8
+	.globl	ffi_call_win64
+
+	.seh_proc ffi_call_win64
+ffi_call_win64:
+	cfi_startproc
+	/* Set up the local stack frame and install it in rbp/rsp.  */
+	movq	(%rsp), %rax
+	movq	%rbp, (arg1)
+	movq	%rax, 8(arg1)
+	movq	arg1, %rbp
+	cfi_def_cfa(%rbp, 16)
+	cfi_rel_offset(%rbp, 0)
+	.seh_pushreg %rbp
+	.seh_setframe %rbp, 0
 	.seh_endprologue
-	mov	%rax, %rcx	# context is first parameter
-	mov	%rsp, %rdx	# stack is second parameter
-	add	$48, %rdx	# point to start of arguments
-	leaq	SYMBOL_NAME(ffi_closure_win64_inner)(%rip), %rax
-	callq	*%rax		# call the real closure function
-	add	$40, %rsp
-	movq	%rax, %xmm0	# If the closure returned a float,
-				# ffi_closure_win64_inner wrote it to rax
-	retq
+	movq	arg0, %rsp
+
+	movq	arg2, %r10
+
+	/* Load all slots into both general and xmm registers.  */
+	movq	(%rsp), %rcx
+	movsd	(%rsp), %xmm0
+	movq	8(%rsp), %rdx
+	movsd	8(%rsp), %xmm1
+	movq	16(%rsp), %r8
+	movsd	16(%rsp), %xmm2
+	movq	24(%rsp), %r9
+	movsd	24(%rsp), %xmm3
+
+	call	*16(%rbp)
+
+	movl	24(%rbp), %ecx
+	movq	32(%rbp), %r8
+	leaq	0f(%rip), %r10
+	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, %ecx
+	leaq	(%r10, %rcx, 8), %r10
+	ja	99f
+	jmp	*%r10
+
+/* Below, we're space constrained most of the time.  Thus we eschew the
+   modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes).  */
+.macro epilogue
+	leaveq
+	cfi_remember_state
+	cfi_def_cfa(%rsp, 8)
+	cfi_restore(%rbp)
+	ret
+	cfi_restore_state
+.endm
+
+	.align	8
+0:
+E FFI_TYPE_VOID
+	epilogue
+E FFI_TYPE_INT
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_FLOAT
+	movss	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_DOUBLE
+	movsd	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_LONGDOUBLE
+	call	abort
+E FFI_TYPE_UINT8
+	movzbl	%al, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT8
+	movsbq	%al, %rax
+	jmp	98f
+E FFI_TYPE_UINT16
+	movzwl	%ax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT16
+	movswq	%ax, %rax
+	jmp	98f
+E FFI_TYPE_UINT32
+	movl	%eax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT32
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_UINT64
+98:	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT64
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_STRUCT
+	epilogue
+E FFI_TYPE_POINTER
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_COMPLEX
+	call	abort
+E FFI_TYPE_SMALL_STRUCT_1B
+	movb	%al, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_2B
+	movw	%ax, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_4B
+	movl	%eax, (%r8)
+	epilogue
+
+	.align	8
+99:	call	abort
+
+.purgem epilogue
+
+	cfi_endproc
 	.seh_endproc
 
-	.balign 16
-	.globl	SYMBOL_NAME(ffi_call_win64)
-	.seh_proc SYMBOL_NAME(ffi_call_win64)
-SYMBOL_NAME(ffi_call_win64):
-	# copy registers onto stack
-	mov	%r9,32(%rsp)
-	mov	%r8,24(%rsp)
-	mov	%rdx,16(%rsp)
-	mov	%rcx,8(%rsp)
-	.seh_pushreg rbp
-	push	%rbp
-	.seh_stackalloc 48
-	sub	$48,%rsp
-	.seh_setframe rbp, 32
-	lea	32(%rsp),%rbp
-	.seh_endprologue
-
-	mov	CIF_BYTES(%rbp),%eax
-	add	$15, %rax
-	and	$-16, %rax
-	cmpq	$0x1000, %rax
-	jb	Lch_done
-Lch_probe:
-	subq	$0x1000,%rsp
-	orl	$0x0, (%rsp)
-	subq	$0x1000,%rax
-	cmpq	$0x1000,%rax
-	ja	Lch_probe
-Lch_done:
-	subq	%rax, %rsp
-	orl	$0x0, (%rsp)
-	lea	32(%rsp), %rax
-	mov	%rax, STACK(%rbp)
-
-	mov	ECIF(%rbp), %rdx
-	mov	STACK(%rbp), %rcx
-	callq	*PREP_ARGS_FN(%rbp)
-
-	mov	STACK(%rbp), %rsp
-
-	movlpd	24(%rsp), %xmm3
-	movd	%xmm3, %r9
-
-	movlpd	16(%rsp), %xmm2
-	movd	%xmm2, %r8
-
-	movlpd	8(%rsp), %xmm1
-	movd	%xmm1, %rdx
-
-	movlpd	(%rsp), %xmm0
-	movd	%xmm0, %rcx
-
-	callq	*FN(%rbp)
-.Lret_struct4b:
- 	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, CIF_FLAGS(%rbp)
- 	jne .Lret_struct2b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%eax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_struct2b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, CIF_FLAGS(%rbp)
-	jne .Lret_struct1b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%ax, (%rcx)
-	jmp .Lret_void
-
-.Lret_struct1b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, CIF_FLAGS(%rbp)
-	jne .Lret_uint8
 
-	mov	RVALUE(%rbp), %rcx
-	mov	%al, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint8:
-	cmpl	$FFI_TYPE_UINT8, CIF_FLAGS(%rbp)
-	jne .Lret_sint8
-
-	mov     RVALUE(%rbp), %rcx
-	movzbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint8:
-	cmpl	$FFI_TYPE_SINT8, CIF_FLAGS(%rbp)
-	jne .Lret_uint16
-
-	mov     RVALUE(%rbp), %rcx
-	movsbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint16:
-	cmpl	$FFI_TYPE_UINT16, CIF_FLAGS(%rbp)
-	jne .Lret_sint16
-
-	mov     RVALUE(%rbp), %rcx
-	movzwq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint16:
-	cmpl	$FFI_TYPE_SINT16, CIF_FLAGS(%rbp)
-	jne .Lret_uint32
-
-	mov     RVALUE(%rbp), %rcx
-	movswq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint32:
-	cmpl	$FFI_TYPE_UINT32, CIF_FLAGS(%rbp)
-	jne .Lret_sint32
-
-	mov     RVALUE(%rbp), %rcx
-	movl    %eax, %eax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint32:
- 	cmpl	$FFI_TYPE_SINT32, CIF_FLAGS(%rbp)
- 	jne	.Lret_float
-
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_float:
- 	cmpl	$FFI_TYPE_FLOAT, CIF_FLAGS(%rbp)
- 	jne	.Lret_double
-
- 	mov	RVALUE(%rbp), %rax
- 	movss	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_double:
- 	cmpl	$FFI_TYPE_DOUBLE, CIF_FLAGS(%rbp)
- 	jne	.Lret_uint64
-
- 	mov	RVALUE(%rbp), %rax
- 	movlpd	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_uint64:
-  	cmpl	$FFI_TYPE_UINT64, CIF_FLAGS(%rbp)
- 	jne	.Lret_sint64
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
-
-.Lret_sint64:
-  	cmpl	$FFI_TYPE_SINT64, CIF_FLAGS(%rbp)
-  	jne	.Lret_pointer
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+/* 32 bytes of outgoing register stack space, 8 bytes of alignment,
+   16 bytes of result, 32 bytes of xmm registers.  */
+#define ffi_clo_FS	(32+8+16+32)
+#define ffi_clo_OFF_R	(32+8)
+#define ffi_clo_OFF_X	(32+8+16)
+
+	.align	8
+	.globl	ffi_go_closure_win64
+
+	.seh_proc ffi_go_closure_win64
+ffi_go_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	8(%r10), arg0			/* load cif */
+	movq	16(%r10), arg1			/* load fun */
+	movq	%r10, arg2			/* closure is user_data */
+	jmp	0f
+	cfi_endproc
+	.seh_endproc
 
-.Lret_pointer:
-  	cmpl	$FFI_TYPE_POINTER, CIF_FLAGS(%rbp)
-  	jne	.Lret_int
+	.align	8
+	.globl	ffi_closure_win64
+
+	.seh_proc ffi_closure_win64
+ffi_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	FFI_TRAMPOLINE_SIZE(%r10), arg0		/* load cif */
+	movq	FFI_TRAMPOLINE_SIZE+8(%r10), arg1	/* load fun */
+	movq	FFI_TRAMPOLINE_SIZE+16(%r10), arg2	/* load user_data */
+0:
+	subq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_clo_FS)
+	.seh_stackalloc ffi_clo_FS
+	.seh_endprologue
 
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+	/* Save all sse arguments into the stack frame.  */
+	movsd	%xmm0, ffi_clo_OFF_X(%rsp)
+	movsd	%xmm1, ffi_clo_OFF_X+8(%rsp)
+	movsd	%xmm2, ffi_clo_OFF_X+16(%rsp)
+	movsd	%xmm3, ffi_clo_OFF_X+24(%rsp)
 
-.Lret_int:
-  	cmpl	$FFI_TYPE_INT, CIF_FLAGS(%rbp)
-  	jne	.Lret_void
+	leaq	ffi_clo_OFF_R(%rsp), arg3
+	call	ffi_closure_win64_inner
 
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
+	/* Load the result into both possible result registers.  */
+	movq    ffi_clo_OFF_R(%rsp), %rax
+	movsd   ffi_clo_OFF_R(%rsp), %xmm0
 
-.Lret_void:
-	xor	%rax, %rax
+	addq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(-ffi_clo_FS)
+	ret
 
-	lea	16(%rbp), %rsp
-	pop	%rbp
-	retq
+	cfi_endproc
 	.seh_endproc
-#endif /* !_MSC_VER */
-
author	Richard Henderson <rth@redhat.com>	2015-01-12 08:19:59 -0800
committer	Richard Henderson <rth@gcc.gnu.org>	2015-01-12 08:19:59 -0800
commit	b1760f7f915a36ee9b4636fb54719c9b3ae59356 (patch)
tree	1a64d747b069bdebf651d856989dd40a54daf0cc /libffi/src/x86
parent	62e22fcb7985349b93646b86351033e1fb09c46c (diff)
download	gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.zip gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.tar.gz gcc-b1760f7f915a36ee9b4636fb54719c9b3ae59356.tar.bz2