diff options
-rw-r--r-- | sysdeps/hppa/dl-fptr.c | 26 | ||||
-rw-r--r-- | sysdeps/hppa/dl-machine.h | 36 | ||||
-rw-r--r-- | sysdeps/hppa/dl-trampoline.S | 74 | ||||
-rw-r--r-- | sysdeps/unix/sysv/linux/hppa/atomic-machine.h | 28 |
4 files changed, 142 insertions, 22 deletions
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c index 0a37397..25ca8f8 100644 --- a/sysdeps/hppa/dl-fptr.c +++ b/sysdeps/hppa/dl-fptr.c @@ -172,8 +172,8 @@ make_fdesc (ElfW(Addr) ip, ElfW(Addr) gp) } install: - fdesc->ip = ip; fdesc->gp = gp; + fdesc->ip = ip; return (ElfW(Addr)) fdesc; } @@ -350,7 +350,9 @@ ElfW(Addr) _dl_lookup_address (const void *address) { ElfW(Addr) addr = (ElfW(Addr)) address; - unsigned int *desc, *gptr; + ElfW(Word) reloc_arg; + volatile unsigned int *desc; + unsigned int *gptr; /* Return ADDR if the least-significant two bits of ADDR are not consistent with ADDR being a linker defined function pointer. The normal value for @@ -367,7 +369,11 @@ _dl_lookup_address (const void *address) if (!_dl_read_access_allowed (desc)) return addr; - /* Load first word of candidate descriptor. It should be a pointer + /* First load the relocation offset. */ + reloc_arg = (ElfW(Word)) desc[1]; + atomic_full_barrier(); + + /* Then load first word of candidate descriptor. It should be a pointer with word alignment and point to memory that can be read. */ gptr = (unsigned int *) desc[0]; if (((unsigned int) gptr & 3) != 0 @@ -377,8 +383,8 @@ _dl_lookup_address (const void *address) /* See if descriptor requires resolution. The following trampoline is used in each global offset table for function resolution: - ldw 0(r20),r22 - bv r0(r22) + ldw 0(r20),r21 + bv r0(r21) ldw 4(r20),r21 tramp: b,l .-12,r20 depwi 0,31,2,r20 @@ -389,7 +395,15 @@ _dl_lookup_address (const void *address) if (gptr[0] == 0xea9f1fdd /* b,l .-12,r20 */ && gptr[1] == 0xd6801c1e /* depwi 0,31,2,r20 */ && (ElfW(Addr)) gptr[2] == elf_machine_resolve ()) - _dl_fixup ((struct link_map *) gptr[5], (ElfW(Word)) desc[1]); + { + struct link_map *l = (struct link_map *) gptr[5]; + + /* If gp has been resolved, we need to hunt for relocation offset. */ + if (!(reloc_arg & PA_GP_RELOC)) + reloc_arg = _dl_fix_reloc_arg (addr, l); + + _dl_fixup (l, reloc_arg); + } return (ElfW(Addr)) desc[0]; } diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h index 9e98366..8ecff97 100644 --- a/sysdeps/hppa/dl-machine.h +++ b/sysdeps/hppa/dl-machine.h @@ -48,6 +48,14 @@ #define GOT_FROM_PLT_STUB (4*4) #define PLT_ENTRY_SIZE (2*4) +/* The gp slot in the function descriptor contains the relocation offset + before resolution. To distinguish between a resolved gp value and an + unresolved relocation offset we set an unused bit in the relocation + offset. This would allow us to do a synchronzied two word update + using this bit (interlocked update), but instead of waiting for the + update we simply recompute the gp value given that we know the ip. */ +#define PA_GP_RELOC 1 + /* Initialize the function descriptor table before relocations */ static inline void __hppa_init_bootstrap_fdesc_table (struct link_map *map) @@ -117,10 +125,28 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t t, volatile Elf32_Addr *rfdesc = reloc_addr; /* map is the link_map for the caller, t is the link_map for the object being called */ - rfdesc[1] = value.gp; - /* Need to ensure that the gp is visible before the code - entry point is updated */ - rfdesc[0] = value.ip; + + /* We would like the function descriptor to be double word aligned. This + helps performance (ip and gp then reside on the same cache line) and + we can update the pair atomically with a single store. The linker + now ensures this alignment but we still have to handle old code. */ + if ((unsigned int)reloc_addr & 7) + { + /* Need to ensure that the gp is visible before the code + entry point is updated */ + rfdesc[1] = value.gp; + atomic_full_barrier(); + rfdesc[0] = value.ip; + } + else + { + /* Update pair atomically with floating point store. */ + union { ElfW(Word) v[2]; double d; } u; + + u.v[0] = value.ip; + u.v[1] = value.gp; + *(volatile double *)rfdesc = u.d; + } return value; } @@ -265,7 +291,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) here. The trampoline code will load the proper LTP and pass the reloc offset to the fixup function. */ - fptr->gp = iplt - jmprel; + fptr->gp = (iplt - jmprel) | PA_GP_RELOC; } /* r_sym != 0 */ else { diff --git a/sysdeps/hppa/dl-trampoline.S b/sysdeps/hppa/dl-trampoline.S index 0114ca8..d0804b3 100644 --- a/sysdeps/hppa/dl-trampoline.S +++ b/sysdeps/hppa/dl-trampoline.S @@ -31,7 +31,7 @@ slow down __cffc when it attempts to call fixup to resolve function descriptor references. Please refer to gcc/gcc/config/pa/fptr.c - Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp. */ + Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp, r22 = fp. */ /* RELOCATION MARKER: bl to provide gcc's __cffc with fixup loc. */ .text @@ -61,17 +61,20 @@ _dl_runtime_resolve: copy %sp, %r1 /* Copy previous sp */ /* Save function result address (on entry) */ stwm %r28,128(%sp) - /* Fillin some frame info to follow ABI */ + /* Fill in some frame info to follow ABI */ stw %r1,-4(%sp) /* Previous sp */ stw %r21,-32(%sp) /* PIC register value */ /* Save input floating point registers. This must be done in the new frame since the previous frame doesn't have enough space */ - ldo -56(%sp),%r1 + ldo -64(%sp),%r1 fstd,ma %fr4,-8(%r1) fstd,ma %fr5,-8(%r1) fstd,ma %fr6,-8(%r1) + + /* Test PA_GP_RELOC bit. */ + bb,>= %r19,31,2f /* branch if not reloc offset */ fstd,ma %fr7,-8(%r1) /* Set up args to fixup func, needs only two arguments */ @@ -79,7 +82,7 @@ _dl_runtime_resolve: copy %r19,%r25 /* (2) reloc offset */ /* Call the real address resolver. */ - bl _dl_fixup,%rp +3: bl _dl_fixup,%rp copy %r21,%r19 /* set fixup func ltp */ /* While the linker will set a function pointer to NULL when it @@ -102,7 +105,7 @@ _dl_runtime_resolve: copy %r29, %r19 /* Reload arguments fp args */ - ldo -56(%sp),%r1 + ldo -64(%sp),%r1 fldd,ma -8(%r1),%fr4 fldd,ma -8(%r1),%fr5 fldd,ma -8(%r1),%fr6 @@ -129,6 +132,25 @@ _dl_runtime_resolve: bv %r0(%rp) ldo -128(%sp),%sp +2: + /* Set up args for _dl_fix_reloc_arg. */ + copy %r22,%r26 /* (1) function pointer */ + depi 0,31,2,%r26 /* clear least significant bits */ + ldw 8+4(%r20),%r25 /* (2) got[1] == struct link_map */ + + /* Save ltp and link map arg for _dl_fixup. */ + stw %r21,-56(%sp) /* ltp */ + stw %r25,-60(%sp) /* struct link map */ + + /* Find reloc offset. */ + bl _dl_fix_reloc_arg,%rp + copy %r21,%r19 /* set func ltp */ + + /* Set up args for _dl_fixup. */ + ldw -56(%sp),%r21 /* ltp */ + ldw -60(%sp),%r26 /* (1) struct link map */ + b 3b + copy %ret0,%r25 /* (2) reloc offset */ .EXIT .PROCEND cfi_endproc @@ -153,7 +175,7 @@ _dl_runtime_profile: copy %sp, %r1 /* Copy previous sp */ /* Save function result address (on entry) */ stwm %r28,192(%sp) - /* Fillin some frame info to follow ABI */ + /* Fill in some frame info to follow ABI */ stw %r1,-4(%sp) /* Previous sp */ stw %r21,-32(%sp) /* PIC register value */ @@ -181,10 +203,11 @@ _dl_runtime_profile: fstd,ma %fr5,8(%r1) fstd,ma %fr6,8(%r1) fstd,ma %fr7,8(%r1) - /* 32-bit stack pointer and return register */ - stw %sp,-56(%sp) - stw %r2,-52(%sp) + /* Test PA_GP_RELOC bit. */ + bb,>= %r19,31,2f /* branch if not reloc offset */ + /* 32-bit stack pointer */ + stw %sp,-56(%sp) /* Set up args to fixup func, needs five arguments */ ldw 8+4(%r20),%r26 /* (1) got[1] == struct link_map */ @@ -197,7 +220,7 @@ _dl_runtime_profile: stw %r1, -52(%sp) /* (5) long int *framesizep */ /* Call the real address resolver. */ - bl _dl_profile_fixup,%rp +3: bl _dl_profile_fixup,%rp copy %r21,%r19 /* set fixup func ltp */ /* Load up the returned function descriptor */ @@ -215,7 +238,9 @@ _dl_runtime_profile: fldd,ma 8(%r1),%fr5 fldd,ma 8(%r1),%fr6 fldd,ma 8(%r1),%fr7 - ldw -52(%sp),%rp + + /* Reload rp register -(192+20) without adjusting stack */ + ldw -212(%sp),%rp /* Reload static link register -(192+16) without adjusting stack */ ldw -208(%sp),%r29 @@ -303,6 +328,33 @@ L(cont): ldw -20(%sp),%rp /* Return */ bv,n 0(%r2) + +2: + /* Set up args for _dl_fix_reloc_arg. */ + copy %r22,%r26 /* (1) function pointer */ + depi 0,31,2,%r26 /* clear least significant bits */ + ldw 8+4(%r20),%r25 /* (2) got[1] == struct link_map */ + + /* Save ltp and link map arg for _dl_fixup. */ + stw %r21,-92(%sp) /* ltp */ + stw %r25,-116(%sp) /* struct link map */ + + /* Find reloc offset. */ + bl _dl_fix_reloc_arg,%rp + copy %r21,%r19 /* set func ltp */ + + /* Restore fixup ltp. */ + ldw -92(%sp),%r21 /* ltp */ + + /* Set up args to fixup func, needs five arguments */ + ldw -116(%sp),%r26 /* (1) struct link map */ + copy %ret0,%r25 /* (2) reloc offset */ + stw %r25,-120(%sp) /* Save reloc offset */ + ldw -212(%sp),%r24 /* (3) profile_fixup needs rp */ + ldo -56(%sp),%r23 /* (4) La_hppa_regs */ + ldo -112(%sp), %r1 + b 3b + stw %r1, -52(%sp) /* (5) long int *framesizep */ .EXIT .PROCEND cfi_endproc diff --git a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h index 9d8ffbe..bf61b66 100644 --- a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h +++ b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h @@ -36,9 +36,37 @@ typedef uintptr_t uatomicptr_t; typedef intmax_t atomic_max_t; typedef uintmax_t uatomic_max_t; +#define atomic_full_barrier() __sync_synchronize () + #define __HAVE_64B_ATOMICS 0 #define USE_ATOMIC_COMPILER_BUILTINS 0 +/* We use the compiler atomic load and store builtins as the generic + defines are not atomic. In particular, we need to use compare and + exchange for stores as the implementation is synthesized. */ +void __atomic_link_error (void); +#define __atomic_check_size_ls(mem) \ + if ((sizeof (*mem) != 1) && (sizeof (*mem) != 2) && sizeof (*mem) != 4) \ + __atomic_link_error (); + +#define atomic_load_relaxed(mem) \ + ({ __atomic_check_size_ls((mem)); \ + __atomic_load_n ((mem), __ATOMIC_RELAXED); }) +#define atomic_load_acquire(mem) \ + ({ __atomic_check_size_ls((mem)); \ + __atomic_load_n ((mem), __ATOMIC_ACQUIRE); }) + +#define atomic_store_relaxed(mem, val) \ + do { \ + __atomic_check_size_ls((mem)); \ + __atomic_store_n ((mem), (val), __ATOMIC_RELAXED); \ + } while (0) +#define atomic_store_release(mem, val) \ + do { \ + __atomic_check_size_ls((mem)); \ + __atomic_store_n ((mem), (val), __ATOMIC_RELEASE); \ + } while (0) + /* XXX Is this actually correct? */ #define ATOMIC_EXCHANGE_USES_CAS 1 |