From 8c2b1ed8bbd20d35314c2a602b903159fa567ffb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 22 Oct 2014 14:20:35 -0700 Subject: ARM: Use movw/movt more when available --- ChangeLog | 27 ++++++++++++++++ config.h.in | 3 ++ setjmp/Makefile | 3 +- setjmp/tst-setjmp-static.c | 1 + sysdeps/arm/__longjmp.S | 35 ++++----------------- sysdeps/arm/configure | 52 +++++++++++++++++++++++++++++-- sysdeps/arm/configure.ac | 44 ++++++++++++++++++++++++-- sysdeps/arm/setjmp.S | 35 ++++----------------- sysdeps/arm/sysdep.h | 78 ++++++++++++++++++++++++++++++++++++++++++---- 9 files changed, 209 insertions(+), 69 deletions(-) create mode 100644 setjmp/tst-setjmp-static.c diff --git a/ChangeLog b/ChangeLog index 589da6b..e7fc134 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,30 @@ +2014-10-22 Roland McGrath + + * sysdeps/arm/__longjmp.S [NEED_HWCAP] [IS_IN_rtld]: Use LDST_PCREL + macro to get at the _rtld_local_ro field. + [NEED_HWCAP] [!IS_IN_rtld]: Use LDR_GLOBAL to get at _rtld_global_ro + ([PIC] case) or _dl_hwcap ([!PIC] case). + * sysdeps/arm/setjmp.S: Likewise. + + * config.h.in (ARM_PCREL_MOVW_OK): New macro. + * sysdeps/arm/configure.ac: New check to define it. + * sysdeps/arm/configure: Regenerated. + * sysdeps/arm/sysdep.h [__ASSEMBLER__]: Include <arm-features.h>. + (LDST_INDEXED_NOINDEX, LDST_INDEXED_INDEX): New macros. + (LDST_INDEXED, LDST_PC_INDEXED): New macros, differing definitions + depending on [ARM_NO_INDEX_REGISTER] and [__thumb2__]. + (LDST_PCREL) [!__thumb2__ && ARCH_HAS_T2 && ARM_PCREL_MOVW_OK]: + Use movw/movt pair instead of a load. + (LDST_GLOBAL): Macro removed. + (LDR_GLOBAL): New macro replaces it. + (LDR_HIDDEN): New macro. + (PTR_MANGLE_LOAD): Use LDR_GLOBAL rather than LDST_GLOBAL. + Use LDR_HIDDEN instead for __pointer_chk_guard_local. + + * setjmp/tst-setjmp-static.c: New file. + * setjmp/Makefile (tests): Add it. + (tests-static): New variable. + 2014-10-22 Maciej W. 
Rozycki [BZ #17485] diff --git a/config.h.in b/config.h.in index 20c0825..695ca35 100644 --- a/config.h.in +++ b/config.h.in @@ -243,6 +243,9 @@ /* The ARM hard-float ABI is being used. */ #undef HAVE_ARM_PCS_VFP +/* The ARM movw/movt instructions using PC-relative relocs work right. */ +#define ARM_PCREL_MOVW_OK 0 + /* The pt_chown binary is being built and used by grantpt. */ #define HAVE_PT_CHOWN 0 diff --git a/setjmp/Makefile b/setjmp/Makefile index 047b9ec..8006d16 100644 --- a/setjmp/Makefile +++ b/setjmp/Makefile @@ -28,7 +28,8 @@ routines := setjmp sigjmp bsd-setjmp bsd-_setjmp \ longjmp __longjmp jmp-unwind tests := tst-setjmp jmpbug bug269-setjmp tst-setjmp-fp \ - tst-sigsetjmp + tst-sigsetjmp tst-setjmp-static +tests-static := tst-setjmp-static include ../Rules diff --git a/setjmp/tst-setjmp-static.c b/setjmp/tst-setjmp-static.c new file mode 100644 index 0000000..5ca5df8 --- /dev/null +++ b/setjmp/tst-setjmp-static.c @@ -0,0 +1 @@ +#include "tst-setjmp.c" diff --git a/sysdeps/arm/__longjmp.S b/sysdeps/arm/__longjmp.S index 27d1b71..a983957 100644 --- a/sysdeps/arm/__longjmp.S +++ b/sysdeps/arm/__longjmp.S @@ -77,21 +77,15 @@ ENTRY (__longjmp) #ifdef NEED_HWCAP # ifdef IS_IN_rtld - ldr a4, 1f - ldr a3, .Lrtld_local_ro -0: add a4, pc, a4 - add a4, a4, a3 - ldr a4, [a4, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET] + LDST_PCREL (ldr, a4, a3, \ + C_SYMBOL_NAME(_rtld_local_ro) \ + + RTLD_GLOBAL_RO_DL_HWCAP_OFFSET) # else # ifdef PIC - ldr a4, 1f - ldr a3, .Lrtld_global_ro -0: add a4, pc, a4 - ldr a4, [a4, a3] - ldr a4, [a4, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET] + LDR_GLOBAL (a4, a3, C_SYMBOL_NAME(_rtld_global_ro), \ + RTLD_GLOBAL_RO_DL_HWCAP_OFFSET) # else - ldr a4, .Lhwcap - ldr a4, [a4, #0] + LDR_GLOBAL (a4, a3, C_SYMBOL_NAME(_dl_hwcap), 0) # endif # endif #endif @@ -138,21 +132,4 @@ ENTRY (__longjmp) DO_RET(lr) -#ifdef NEED_HWCAP -# ifdef IS_IN_rtld -1: .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS -.Lrtld_local_ro: - .long C_SYMBOL_NAME(_rtld_local_ro)(GOTOFF) -# else -# 
ifdef PIC -1: .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS -.Lrtld_global_ro: - .long C_SYMBOL_NAME(_rtld_global_ro)(GOT) -# else -.Lhwcap: - .long C_SYMBOL_NAME(_dl_hwcap) -# endif -# endif -#endif - END (__longjmp) diff --git a/sysdeps/arm/configure b/sysdeps/arm/configure index 238b335..45667cc 100644 --- a/sysdeps/arm/configure +++ b/sysdeps/arm/configure @@ -150,8 +150,8 @@ else cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ #ifdef __ARM_PCS_VFP - yes - #endif + yes + #endif _ACEOF if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | @@ -211,6 +211,54 @@ else have-arm-tls-desc = no" fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether PC-relative relocs in movw/movt work properly" >&5 +$as_echo_n "checking whether PC-relative relocs in movw/movt work properly... " >&6; } +if ${libc_cv_arm_pcrel_movw+:} false; then : + $as_echo_n "(cached) " >&6 +else + +cat > conftest.s <<\EOF + .syntax unified + .arm + .arch armv7-a + + .text + .globl foo + .type foo,%function +foo: movw r0, #:lower16:symbol - 1f - 8 + movt r0, #:upper16:symbol - 1f - 8 +1: add r0, pc + @ And now a case with a local symbol. + movw r0, #:lower16:3f - 2f - 8 + movt r0, #:upper16:3f - 2f - 8 +2: add r0, pc + bx lr + +.data + .globl symbol + .hidden symbol +symbol: .long 23 +3: .long 17 +EOF +libc_cv_arm_pcrel_movw=no +${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS \ + -nostartfiles -nostdlib -shared \ + -o conftest.so conftest.s 1>&5 2>&5 && +LC_ALL=C $READELF -dr conftest.so > conftest.dr 2>&5 && +{ + cat conftest.dr 1>&5 + fgrep 'TEXTREL +R_ARM_NONE' conftest.dr > /dev/null || libc_cv_arm_pcrel_movw=yes +} +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_arm_pcrel_movw" >&5 +$as_echo "$libc_cv_arm_pcrel_movw" >&6; } +if test $libc_cv_arm_pcrel_movw = yes; then + $as_echo "#define ARM_PCREL_MOVW_OK 1" >>confdefs.h + +fi + libc_cv_gcc_unwind_find_fde=no # Remove -fno-unwind-tables that was added in sysdeps/arm/preconfigure.ac. 
diff --git a/sysdeps/arm/configure.ac b/sysdeps/arm/configure.ac index 86c0c08..002b8ef 100644 --- a/sysdeps/arm/configure.ac +++ b/sysdeps/arm/configure.ac @@ -17,8 +17,8 @@ dnl it. Until we do, don't define it. AC_CACHE_CHECK([whether the compiler is using the ARM hard-float ABI], [libc_cv_arm_pcs_vfp], [AC_EGREP_CPP(yes,[#ifdef __ARM_PCS_VFP - yes - #endif + yes + #endif ], libc_cv_arm_pcs_vfp=yes, libc_cv_arm_pcs_vfp=no)]) if test $libc_cv_arm_pcs_vfp = yes; then AC_DEFINE(HAVE_ARM_PCS_VFP) @@ -40,6 +40,46 @@ else LIBC_CONFIG_VAR([have-arm-tls-desc], [no]) fi +AC_CACHE_CHECK([whether PC-relative relocs in movw/movt work properly], + libc_cv_arm_pcrel_movw, [ +cat > conftest.s <<\EOF + .syntax unified + .arm + .arch armv7-a + + .text + .globl foo + .type foo,%function +foo: movw r0, #:lower16:symbol - 1f - 8 + movt r0, #:upper16:symbol - 1f - 8 +1: add r0, pc + @ And now a case with a local symbol. + movw r0, #:lower16:3f - 2f - 8 + movt r0, #:upper16:3f - 2f - 8 +2: add r0, pc + bx lr + +.data + .globl symbol + .hidden symbol +symbol: .long 23 +3: .long 17 +EOF +libc_cv_arm_pcrel_movw=no +${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS \ + -nostartfiles -nostdlib -shared \ + -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD && +LC_ALL=C $READELF -dr conftest.so > conftest.dr 2>&AS_MESSAGE_LOG_FD && +{ + cat conftest.dr 1>&AS_MESSAGE_LOG_FD + fgrep 'TEXTREL +R_ARM_NONE' conftest.dr > /dev/null || libc_cv_arm_pcrel_movw=yes +} +rm -f conftest*]) +if test $libc_cv_arm_pcrel_movw = yes; then + AC_DEFINE([ARM_PCREL_MOVW_OK]) +fi + libc_cv_gcc_unwind_find_fde=no # Remove -fno-unwind-tables that was added in sysdeps/arm/preconfigure.ac. diff --git a/sysdeps/arm/setjmp.S b/sysdeps/arm/setjmp.S index 17a16c9..6f54ab3 100644 --- a/sysdeps/arm/setjmp.S +++ b/sysdeps/arm/setjmp.S @@ -58,21 +58,15 @@ ENTRY (__sigsetjmp) #ifdef NEED_HWCAP /* Check if we have a VFP unit. 
*/ # ifdef IS_IN_rtld - ldr a3, 1f - ldr a4, .Lrtld_local_ro -0: add a3, pc, a3 - add a3, a3, a4 - ldr a3, [a3, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET] + LDST_PCREL (ldr, a3, a4, \ + C_SYMBOL_NAME(_rtld_local_ro) \ + + RTLD_GLOBAL_RO_DL_HWCAP_OFFSET) # else # ifdef PIC - ldr a3, 1f - ldr a4, .Lrtld_global_ro -0: add a3, pc, a3 - ldr a3, [a3, a4] - ldr a3, [a3, #RTLD_GLOBAL_RO_DL_HWCAP_OFFSET] + LDR_GLOBAL (a3, a4, C_SYMBOL_NAME(_rtld_global_ro), \ + RTLD_GLOBAL_RO_DL_HWCAP_OFFSET) # else - ldr a3, .Lhwcap - ldr a3, [a3, #0] + LDR_GLOBAL (a3, a4, C_SYMBOL_NAME(_dl_hwcap), 0) # endif # endif #endif @@ -114,23 +108,6 @@ ENTRY (__sigsetjmp) /* Make a tail call to __sigjmp_save; it takes the same args. */ B PLTJMP(C_SYMBOL_NAME(__sigjmp_save)) -#ifdef NEED_HWCAP -# ifdef IS_IN_rtld -1: .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS -.Lrtld_local_ro: - .long C_SYMBOL_NAME(_rtld_local_ro)(GOTOFF) -# else -# ifdef PIC -1: .long _GLOBAL_OFFSET_TABLE_ - 0b - PC_OFS -.Lrtld_global_ro: - .long C_SYMBOL_NAME(_rtld_global_ro)(GOT) -# else -.Lhwcap: - .long C_SYMBOL_NAME(_dl_hwcap) -# endif -# endif -#endif - END (__sigsetjmp) hidden_def (__sigsetjmp) diff --git a/sysdeps/arm/sysdep.h b/sysdeps/arm/sysdep.h index 4c41213..8614b4a 100644 --- a/sysdeps/arm/sysdep.h +++ b/sysdeps/arm/sysdep.h @@ -21,6 +21,8 @@ #ifndef __ASSEMBLER__ # include +#else +# include #endif /* The __ARM_ARCH define is provided by gcc 4.8. Construct it otherwise. */ @@ -157,6 +159,32 @@ .arm # endif +/* Load or store to/from address X + Y into/from R, (maybe) using T. + X or Y can use T freely; T can be R if OP is a load. The first + version eschews the two-register addressing mode, while the + second version uses it. */ +# define LDST_INDEXED_NOINDEX(OP, R, T, X, Y) \ + add T, X, Y; \ + sfi_breg T, \ + OP R, [T] +# define LDST_INDEXED_INDEX(OP, R, X, Y) \ + OP R, [X, Y] + +# ifdef ARM_NO_INDEX_REGISTER +/* We're never using the two-register addressing mode, so this + always uses an intermediate add. 
*/ +# define LDST_INDEXED(OP, R, T, X, Y) LDST_INDEXED_NOINDEX (OP, R, T, X, Y) +# define LDST_PC_INDEXED(OP, R, T, X) LDST_INDEXED_NOINDEX (OP, R, T, pc, X) +# else +/* The two-register addressing mode is OK, except on Thumb with pc. */ +# define LDST_INDEXED(OP, R, T, X, Y) LDST_INDEXED_INDEX (OP, R, X, Y) +# ifdef __thumb2__ +# define LDST_PC_INDEXED(OP, R, T, X) LDST_INDEXED_NOINDEX (OP, R, T, pc, X) +# else +# define LDST_PC_INDEXED(OP, R, T, X) LDST_INDEXED_INDEX (OP, R, pc, X) +# endif +# endif + /* Load or store to/from a pc-relative EXPR into/from R, using T. */ # ifdef __thumb2__ # define LDST_PCREL(OP, R, T, EXPR) \ @@ -166,6 +194,11 @@ .previous; \ 99: add T, T, pc; \ OP R, [T] +# elif defined (ARCH_HAS_T2) && ARM_PCREL_MOVW_OK +# define LDST_PCREL(OP, R, T, EXPR) \ + movw T, #:lower16:EXPR - 99f - PC_OFS; \ + movt T, #:upper16:EXPR - 99f - PC_OFS; \ +99: LDST_PC_INDEXED (OP, R, T, T) # else # define LDST_PCREL(OP, R, T, EXPR) \ ldr T, 98f; \ @@ -175,17 +208,50 @@ 99: OP R, [pc, T] # endif -/* Load or store to/from a global EXPR into/from R, using T. */ -# define LDST_GLOBAL(OP, R, T, EXPR) \ +/* Load from a global SYMBOL + CONSTANT into R, using T. 
*/ +# if defined (ARCH_HAS_T2) && !defined (PIC) +# define LDR_GLOBAL(R, T, SYMBOL, CONSTANT) \ + movw T, #:lower16:SYMBOL; \ + movt T, #:upper16:SYMBOL; \ + ldr R, [T, $CONSTANT] +# elif defined (ARCH_HAS_T2) && defined (PIC) && ARM_PCREL_MOVW_OK +# define LDR_GLOBAL(R, T, SYMBOL, CONSTANT) \ + movw R, #:lower16:_GLOBAL_OFFSET_TABLE_ - 97f - PC_OFS; \ + movw T, #:lower16:99f - 98f - PC_OFS; \ + movt R, #:upper16:_GLOBAL_OFFSET_TABLE_ - 97f - PC_OFS; \ + movt T, #:upper16:99f - 98f - PC_OFS; \ + .pushsection .rodata.cst4, "aM", %progbits, 4; \ + .balign 4; \ +99: .word SYMBOL##(GOT); \ + .popsection; \ +97: add R, R, pc; \ +98: LDST_PC_INDEXED (ldr, T, T, T); \ + LDST_INDEXED (ldr, R, T, R, T); \ + ldr R, [R, $CONSTANT] +# else +# define LDR_GLOBAL(R, T, SYMBOL, CONSTANT) \ ldr T, 99f; \ ldr R, 100f; \ 98: add T, T, pc; \ ldr T, [T, R]; \ .subsection 2; \ 99: .word _GLOBAL_OFFSET_TABLE_ - 98b - PC_OFS; \ -100: .word EXPR##(GOT); \ +100: .word SYMBOL##(GOT); \ .previous; \ - OP R, [T] + ldr R, [T, $CONSTANT] +# endif + +/* This is the same as LDR_GLOBAL, but for a SYMBOL that is known to + be in the same linked object (as for one with hidden visibility). + We can avoid the GOT indirection in the PIC case. For the pure + static case, LDR_GLOBAL is already optimal. */ +# ifdef PIC +# define LDR_HIDDEN(R, T, SYMBOL, CONSTANT) \ + LDST_PCREL (ldr, R, T, SYMBOL + CONSTANT) +# else +# define LDR_HIDDEN(R, T, SYMBOL, CONSTANT) \ + LDR_GLOBAL (R, T, SYMBOL, CONSTANT) +# endif /* Cope with negative memory offsets, which thumb can't encode. 
Use NEGOFF_ADJ_BASE to (conditionally) alter the base register, @@ -296,7 +362,7 @@ (!defined SHARED && (!defined NOT_IN_libc || defined IS_IN_libpthread))) # ifdef __ASSEMBLER__ # define PTR_MANGLE_LOAD(guard, tmp) \ - LDST_PCREL(ldr, guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard_local)); + LDR_HIDDEN (guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard_local), 0) # define PTR_MANGLE(dst, src, guard, tmp) \ PTR_MANGLE_LOAD(guard, tmp); \ PTR_MANGLE2(dst, src, guard) @@ -316,7 +382,7 @@ extern uintptr_t __pointer_chk_guard_local attribute_relro attribute_hidden; #else # ifdef __ASSEMBLER__ # define PTR_MANGLE_LOAD(guard, tmp) \ - LDST_GLOBAL(ldr, guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard)); + LDR_GLOBAL (guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard), 0); # define PTR_MANGLE(dst, src, guard, tmp) \ PTR_MANGLE_LOAD(guard, tmp); \ PTR_MANGLE2(dst, src, guard) -- cgit v1.1