aboutsummaryrefslogtreecommitdiff
path: root/nptl
diff options
context:
space:
mode:
authorAdhemerval Zanella <adhemerval.zanella@linaro.org>2017-01-31 18:01:59 -0200
committerAdhemerval Zanella <adhemerval.zanella@linaro.org>2017-06-14 17:22:35 -0300
commit0edbf1230131dfeb03d843d2859e2104456fad80 (patch)
tree308321439470d11d70f6b84464d33021cf65f575 /nptl
parent5c3e322d3be3803636e38bcaf083fb59b3a34f0c (diff)
downloadglibc-0edbf1230131dfeb03d843d2859e2104456fad80.zip
glibc-0edbf1230131dfeb03d843d2859e2104456fad80.tar.gz
glibc-0edbf1230131dfeb03d843d2859e2104456fad80.tar.bz2
nptl: Invert the mmap/mprotect logic on allocated stacks (BZ#18988)
The current allocate_stack logic for creating stacks is to first mmap all the required memory with the desired protection and then mprotect the guard area with PROT_NONE if required. Although it works as expected, it pessimizes the allocation because it requires the kernel to actually increase the commit charge (it counts against the physical/swap memory available to the system). This patch inverts the logic: the whole area is first mapped with PROT_NONE and only the required portion, excluding the guard area, is then mprotect'ed with the requested permissions, which avoids committing memory for the guard area. The only issue is how to actually test this change, since the side effects are really Linux-specific and accounting for them would require kernel-specific tests that parse system-wide information. On the kernel I checked, /proc/self/statm does not show any meaningful difference for vmm and/or rss before and after thread creation. I could only see really meaningful information by checking the system-wide /proc/meminfo between thread creations: MemFree, MemAvailable, and Committed_AS show a large difference without the patch. I think trying to use this kind of information in a testcase is fragile. The BZ#18988 report shows that the committed pages are easily seen with mlockall (MCL_FUTURE) (which locks all pages that become mapped in the process); however, a more straightforward testcase shows that pthread_create could be faster using this patch: -- static const int inner_count = 256; static const int outer_count = 128; static void *thread1(void *arg) { return NULL; } static void *sleeper(void *arg) { pthread_t ts[inner_count]; for (int i = 0; i < inner_count; i++) pthread_create (&ts[i], &a, thread1, NULL); for (int i = 0; i < inner_count; i++) pthread_join (ts[i], NULL); return NULL; } int main(void) { pthread_attr_init(&a); pthread_attr_setguardsize(&a, 1<<20); pthread_attr_setstacksize(&a, 1134592); pthread_t ts[outer_count]; for (int i = 0; i < outer_count; i++) pthread_create(&ts[i], &a, sleeper, NULL); for (int i = 0; i < outer_count; i++) pthread_join(ts[i], NULL); assert(r == 0); } return 0; } -- On x86_64 (4.4.0-45-generic, gcc 5.4.0) running the small benchtests I see: $ time ./test real 
0m3.647s user 0m0.080s sys 0m11.836s While with the patch I see: $ time ./test real 0m0.696s user 0m0.040s sys 0m1.152s So I added a pthread_create benchtest (thread_create) which checks the thread creation latency. As with the simple benchtest, I saw improvements in thread creation on all architectures on which I tested the change. Checked on x86_64-linux-gnu, i686-linux-gnu, aarch64-linux-gnu, arm-linux-gnueabihf, powerpc64le-linux-gnu, sparc64-linux-gnu, and sparcv9-linux-gnu. [BZ #18988] * benchtests/thread_create-inputs: New file. * benchtests/thread_create-source.c: Likewise. * support/xpthread_attr_setguardsize.c: Likewise. * support/Makefile (libsupport-routines): Add xpthread_attr_setguardsize object. * support/xthread.h: Add xpthread_attr_setguardsize prototype. * benchtests/Makefile (bench-pthread): Add thread_create. * nptl/allocatestack.c (allocate_stack): Call mmap with PROT_NONE and then mprotect the required area.
Diffstat (limited to 'nptl')
-rw-r--r--nptl/allocatestack.c66
1 files changed, 58 insertions, 8 deletions
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index e5c5f79..8364406 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -334,6 +334,43 @@ change_stack_perm (struct pthread *pd
return 0;
}
+/* Return the start of the guard area inside the stack block MEM of SIZE bytes. */
+static inline char *
+__attribute ((always_inline))
+guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
+ size_t pagesize_m1)
+{
+#ifdef NEED_SEPARATE_REGISTER_STACK
+ return mem + (((size - guardsize) / 2) & ~pagesize_m1); /* Half of the non-guard space above MEM, masked with ~pagesize_m1. */
+#elif _STACK_GROWS_DOWN
+ return mem; /* Guard begins at the first byte of the block. */
+#elif _STACK_GROWS_UP
+ return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1); /* GUARDSIZE bytes below PD, masked with ~pagesize_m1. */
+#endif
+}
+
+/* Based on stack allocated with PROT_NONE, setup the required portions with
+ 'prot' flags based on the guard page position. Returns 0 or mprotect's errno. */
+static inline int
+setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
+ const int prot)
+{
+ char *guardend = guard + guardsize;
+#if _STACK_GROWS_DOWN && !defined (NEED_SEPARATE_REGISTER_STACK)
+ /* As defined at guard_position, for architectures with downward stack and no
+ separate register stack the guard page is at start of the allocated area. */
+ if (mprotect (guardend, size - guardsize, prot) != 0)
+ return errno;
+#else /* Guard is not at the start of the block: make both sides of it accessible. */
+ size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
+ if (mprotect (mem, mprots1, prot) != 0)
+ return errno;
+ size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
+ if (mprotect (guardend, mprots2, prot) != 0)
+ return errno;
+#endif
+ return 0;
+}
/* Returns a usable stack for a new thread either by allocating a
new stack or reusing a cached stack of sufficient size.
@@ -490,7 +527,10 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
size += pagesize_m1 + 1;
#endif
- mem = mmap (NULL, size, prot,
+ /* If a guard page is required, avoid committing memory by first
+ allocate with PROT_NONE and then reserve with required permission
+ excluding the guard page. */
+ mem = mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (__glibc_unlikely (mem == MAP_FAILED))
@@ -510,9 +550,24 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
- TLS_PRE_TCB_SIZE);
#endif
+ /* Now mprotect the required region excluding the guard area. */
+ if (__glibc_likely (guardsize > 0))
+ {
+ char *guard = guard_position (mem, size, guardsize, pd,
+ pagesize_m1);
+ if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
+ {
+ munmap (mem, size);
+ return errno;
+ }
+ }
+
/* Remember the stack-related values. */
pd->stackblock = mem;
pd->stackblock_size = size;
+ /* Update guardsize for newly allocated guardsize to avoid
+ an mprotect in guard resize below. */
+ pd->guardsize = guardsize;
/* We allocated the first block thread-specific data array.
This address will not change for the lifetime of this
@@ -593,13 +648,8 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
/* Create or resize the guard area if necessary. */
if (__glibc_unlikely (guardsize > pd->guardsize))
{
-#ifdef NEED_SEPARATE_REGISTER_STACK
- char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
-#elif _STACK_GROWS_DOWN
- char *guard = mem;
-#elif _STACK_GROWS_UP
- char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
-#endif
+ char *guard = guard_position (mem, size, guardsize, pd,
+ pagesize_m1);
if (mprotect (guard, guardsize, PROT_NONE) != 0)
{
mprot_error: