Diffstat (limited to 'libffi/src/x86/unix64.S')
-rw-r--r--	libffi/src/x86/unix64.S	123
1 file changed, 80 insertions(+), 43 deletions(-)
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index c9e0792..831e1d7 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -31,7 +31,7 @@
 .text
 
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-		    void *raddr, void (*fnaddr)());
+		    void *raddr, void (*fnaddr)());
 
    Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
    for this function.  This has been allocated by ffi_call.  We also
@@ -39,7 +39,7 @@
 
 	.align	2
 	.globl	ffi_call_unix64
-	.type	ffi_call_unix64,@function
+	.type	ffi_call_unix64,@function
 
 ffi_call_unix64:
 .LUW0:
@@ -53,6 +53,7 @@ ffi_call_unix64:
 .LUW1:
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
+	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
@@ -61,14 +62,9 @@ ffi_call_unix64:
 	movq	24(%r10), %rcx
 	movq	32(%r10), %r8
 	movq	40(%r10), %r9
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	testl	%eax, %eax
+	jnz	.Lload_sse
+.Lret_from_load_sse:
 
 	/* Deallocate the reg arg area.  */
 	leaq	176(%r10), %rsp
@@ -181,37 +177,49 @@ ffi_call_unix64:
 	movq	%rax, (%rsi)
 	movq	%rdx, 8(%rsi)
 
-	/* Bits 11-31 contain the true size of the structure.  Copy from
+	/* Bits 12-31 contain the true size of the structure.  Copy from
 	   the scratch area to the true destination.  */
-	shrl	$11, %ecx
+	shrl	$12, %ecx
 	rep movsb
 	ret
+
+	/* Many times we can avoid loading any SSE registers at all.
+	   It's not worth an indirect jump to load the exact set of
+	   SSE registers needed; zero or all is a good compromise.  */
+	.align 2
 .LUW3:
+.Lload_sse:
+	movdqa	48(%r10), %xmm0
+	movdqa	64(%r10), %xmm1
+	movdqa	80(%r10), %xmm2
+	movdqa	96(%r10), %xmm3
+	movdqa	112(%r10), %xmm4
+	movdqa	128(%r10), %xmm5
+	movdqa	144(%r10), %xmm6
+	movdqa	160(%r10), %xmm7
+	jmp	.Lret_from_load_sse
+
+.LUW4:
 	.size	ffi_call_unix64,.-ffi_call_unix64
 
 	.align	2
 	.globl	ffi_closure_unix64
-	.type	ffi_closure_unix64,@function
+	.type	ffi_closure_unix64,@function
 
 ffi_closure_unix64:
-.LUW4:
-	subq	$200, %rsp
 .LUW5:
-
+	/* The carry flag is set by the trampoline iff SSE registers
+	   are used.  Don't clobber it before the branch instruction.  */
+	leaq	-200(%rsp), %rsp
+.LUW6:
 	movq	%rdi, (%rsp)
-	movq	%rsi, 8(%rsp)
-	movq	%rdx, 16(%rsp)
-	movq	%rcx, 24(%rsp)
-	movq	%r8, 32(%rsp)
-	movq	%r9, 40(%rsp)
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
+	movq	%rsi, 8(%rsp)
+	movq	%rdx, 16(%rsp)
+	movq	%rcx, 24(%rsp)
+	movq	%r8, 32(%rsp)
+	movq	%r9, 40(%rsp)
+	jc	.Lsave_sse
+.Lret_from_save_sse:
 
 	movq	%r10, %rdi
 	leaq	176(%rsp), %rsi
@@ -221,7 +229,7 @@ ffi_closure_unix64:
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
 	addq	$200, %rsp
-.LUW6:
+.LUW7:
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
 	movzbl	%al, %r10d
@@ -300,7 +308,22 @@ ffi_closure_unix64:
 	movq	-24(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
-.LUW7:
+
+	/* See the comment above .Lload_sse; the same logic applies here.  */
+	.align 2
+.LUW8:
+.Lsave_sse:
+	movdqa	%xmm0, 48(%rsp)
+	movdqa	%xmm1, 64(%rsp)
+	movdqa	%xmm2, 80(%rsp)
+	movdqa	%xmm3, 96(%rsp)
+	movdqa	%xmm4, 112(%rsp)
+	movdqa	%xmm5, 128(%rsp)
+	movdqa	%xmm6, 144(%rsp)
+	movdqa	%xmm7, 160(%rsp)
+	jmp	.Lret_from_save_sse
+
+.LUW9:
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
 	.section	.eh_frame,"a",@progbits
@@ -327,24 +350,25 @@ ffi_closure_unix64:
 .LASFDE1:
 	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
 	.long	.LUW0-.			/* FDE initial location */
-	.long	.LUW3-.LUW0		/* FDE address range */
+	.long	.LUW4-.LUW0		/* FDE address range */
 	.uleb128 0x0			/* Augmentation size */
 
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.long	.LUW1-.LUW0
 
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
+	/* New stack frame based off rbp.  This is a itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
 	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
 	.uleb128 6
 	.uleb128 32
 	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
 	.uleb128 2
+	.byte	0xa			/* DW_CFA_remember_state */
 
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.long	.LUW2-.LUW1
@@ -352,23 +376,36 @@ ffi_closure_unix64:
 	.uleb128 7
 	.uleb128 8
 	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
+
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW3-.LUW2
+	.byte	0xb			/* DW_CFA_restore_state */
+
 	.align 8
 .LEFDE1:
 
 .LSFDE3:
 	.long	.LEFDE3-.LASFDE3	/* FDE Length */
 .LASFDE3:
 	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-	.long	.LUW4-.			/* FDE initial location */
-	.long	.LUW7-.LUW4		/* FDE address range */
+	.long	.LUW5-.			/* FDE initial location */
+	.long	.LUW9-.LUW5		/* FDE address range */
 	.uleb128 0x0			/* Augmentation size */
+
 	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW5-.LUW4
+	.long	.LUW6-.LUW5
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.uleb128 208
+	.byte	0xa			/* DW_CFA_remember_state */
+
 	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
+	.long	.LUW7-.LUW6
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.uleb128 8
+
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW8-.LUW7
+	.byte	0xb			/* DW_CFA_restore_state */
+
 	.align 8
 .LEFDE3:
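
Note: this patch makes both halves of the FFI lazy about SSE state. ffi_call_unix64 now receives the number of SSE register arguments as a sixth argument (arriving in %r9, copied to %eax), and the testl/jnz pair skips all eight movdqa loads when it is zero; ffi_closure_unix64 branches on the carry flag that the trampoline sets when SSE registers carry arguments. This mirrors the System V x86-64 varargs convention, where %al bounds the number of vector registers a callee must spill. A minimal C sketch of the caller-side counting follows; count_sse_args, arg_class, and the classification table are illustrative names assumed for this example, not libffi API:

    /* Hypothetical sketch: how the C side of ffi_call might compute
       the SSE-register count passed to the patched ffi_call_unix64.
       All names here are illustrative, not libffi's.  */
    #include <stdio.h>

    enum arg_class { ARG_INTEGER, ARG_SSE };

    /* Floats and doubles classify as SSE under the System V x86-64
       ABI, so each consumes one of the eight %xmm argument regs.  */
    static unsigned count_sse_args(const enum arg_class *classes,
                                   unsigned n)
    {
        unsigned nsse = 0;
        for (unsigned i = 0; i < n; i++)
            if (classes[i] == ARG_SSE)
                nsse++;
        return nsse;
    }

    int main(void)
    {
        enum arg_class demo[] = { ARG_INTEGER, ARG_SSE, ARG_SSE };
        /* With a count of zero, the patched entry path falls through
           the testl/jnz and never touches %xmm0-%xmm7.  */
        printf("SSE register args: %u\n", count_sse_args(demo, 3));
        return 0;
    }

Zero-or-all is the compromise the in-line comment names: the common all-integer call avoids touching SSE state entirely, while unwinding through the out-of-line .Lload_sse/.Lsave_sse stubs stays correct thanks to the DW_CFA_remember_state/DW_CFA_restore_state pairs added to both FDEs.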