author     DJ Delorie <dj@delorie.com>    2017-05-11 16:44:59 -0400
committer  DJ Delorie <dj@delorie.com>    2017-05-11 17:09:22 -0400
commit     4da80dbb06a7394581f74deae489858bf1607f90
tree       975421d6fd43ac5ffc0d9616717c9f5193631411
parent     6d620560b6e21e15deeeb713af9ea52c679606e3
Updates to tcache
* remove legacy environment variables
* remove tcache mallopt() options
* tweak size2tidx/tidx2size macros to be more accurate and consistent (see the sketch after this list)
* add comments
* tcache_max -> tcache_bins
* tunables made SXID_IGNORE
* dedup fastbin removal code snippets
* document tunables
* document probes
* DeCamelCaseify
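
The macro rework listed above is easiest to see with concrete numbers. The following standalone sketch is not part of the patch: the MALLOC_ALIGNMENT/SIZE_SZ/MINSIZE values and the simplified request2size() are assumptions for a typical 64-bit glibc build, used only to mirror the new csize2tidx/tidx2usize mapping and reproduce the bin table from the updated comment in malloc.c (idx 0 holds requests up to 24 bytes, idx 1 up to 40, and so on).

/* Hypothetical standalone model of the reworked tcache index macros.
   The constants below are assumptions for a typical 64-bit glibc build
   (MALLOC_ALIGNMENT == 16, SIZE_SZ == 8, MINSIZE == 32); nothing here
   is taken verbatim from malloc.c except the two index macros.  */
#include <stdio.h>
#include <stddef.h>

#define MALLOC_ALIGNMENT 16
#define SIZE_SZ          8
#define MINSIZE          32

/* Map a chunk size to a tcache bin, and a bin index back to the
   largest user-visible request size that bin can hold.  */
#define csize2tidx(x)  (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
#define tidx2usize(i)  (((size_t) (i)) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

/* Simplified stand-in for request2size: add room for one size field
   and round up to the alignment, with a floor of MINSIZE (the real
   macro also checks for overflow).  */
static size_t
request2size (size_t req)
{
  size_t sz = (req + SIZE_SZ + MALLOC_ALIGNMENT - 1)
              & ~((size_t) MALLOC_ALIGNMENT - 1);
  return sz < MINSIZE ? MINSIZE : sz;
}

int
main (void)
{
  /* Reproduce the bin table from the new comment in malloc.c.  */
  for (int i = 0; i < 3; i++)
    printf ("idx %d holds requests up to %zu bytes\n", i, tidx2usize (i));

  /* A 25-byte request is padded to a 48-byte chunk and lands in bin 1.  */
  size_t chunk = request2size (25);
  printf ("request 25 -> chunk %zu -> bin %zu\n", chunk, csize2tidx (chunk));
  return 0;
}

On a 32-bit build (alignment 8, SIZE_SZ 4, MINSIZE 16) the same arithmetic yields the 0..12, 13..20, 21..28 ranges given in that comment.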
 elf/dl-tunables.list |   6
 malloc/arena.c       |  28
 malloc/malloc.c      | 163
 manual/probes.texi   |  19
 manual/tunables.texi |  34

5 files changed, 130 insertions, 120 deletions
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index 37620c8..af2b46f 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -78,15 +78,15 @@ glibc {
     }
     tcache_max {
       type: SIZE_T
-      env_alias: MALLOC_TCACHE_MAX
+      security_level: SXID_IGNORE
     }
     tcache_count {
       type: SIZE_T
-      env_alias: MALLOC_TCACHE_COUNT
+      security_level: SXID_IGNORE
     }
     tcache_unsorted_limit {
       type: SIZE_T
-      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
+      security_level: SXID_IGNORE
     }
   }
 }
diff --git a/malloc/arena.c b/malloc/arena.c
index 79e918f..dacc481 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -330,7 +330,8 @@ ptmalloc_init (void)
 #if USE_TCACHE
   TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
-  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL,
+                                 set_tcache_unsorted_limit);
 #endif
   __libc_lock_unlock (main_arena.mutex);
 #else
@@ -381,23 +382,7 @@ ptmalloc_init (void)
               if (memcmp (envline, "ARENA_TEST", 10) == 0)
                 __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
             }
-#if USE_TCACHE
-          if (!__builtin_expect (__libc_enable_secure, 0))
-            {
-              if (memcmp (envline, "TCACHE_MAX", 10) == 0)
-                __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
-            }
-#endif
           break;
-#if USE_TCACHE
-        case 12:
-          if (!__builtin_expect (__libc_enable_secure, 0))
-            {
-              if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
-                __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
-            }
-          break;
-#endif
         case 15:
           if (!__builtin_expect (__libc_enable_secure, 0))
             {
@@ -407,15 +392,6 @@ ptmalloc_init (void)
                 __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
             }
           break;
-#if USE_TCACHE
-        case 21:
-          if (!__builtin_expect (__libc_enable_secure, 0))
-            {
-              if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
-                __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
-            }
-          break;
-#endif
         default:
           break;
         }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 8cd03d8..91551ae 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -297,30 +297,27 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
 }
 #endif

-#ifndef USE_TCACHE
-# define USE_TCACHE 0
-#endif
 #if USE_TCACHE
 /* We want 64 entries.  This is an arbitrary limit, which tunables can
    reduce.  */
-# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)
-# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
-# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+# define TCACHE_MAX_BINS		64
+# define MAX_TCACHE_SIZE	tidx2usize (TCACHE_MAX_BINS-1)

-# define tidx2csize(idx)	((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
-# define tidx2usize(idx)	((idx) * MALLOC_ALIGNMENT)
+/* Only used to pre-fill the tunables.  */
+# define tidx2usize(idx)	(((size_t) idx) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)

-/* When "x" is a user-provided size.  */
-# define usize2tidx(x) size2tidx_ (x)
 /* When "x" is from chunksize().  */
-# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
+# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) csize2tidx (request2size (x))

-/* Rounds up, so...
-   idx 0   bytes 0
-   idx 1   bytes 1..8
-   idx 2   bytes 9..16
+/* With rounding and alignment, the bins are...
+   idx 0   bytes 0..24 (64-bit) or 0..12 (32-bit)
+   idx 1   bytes 25..40 or 13..20
+   idx 2   bytes 41..56 or 21..28
    etc.  */

-/* This is another arbitrary limit, which tunables can change.  */
+/* This is another arbitrary limit, which tunables can change.  Each
+   tcache bin will hold at most this number of chunks.  */
 # define TCACHE_FILL_COUNT 7
 #endif
@@ -1741,12 +1738,12 @@ struct malloc_par

 #if USE_TCACHE
   /* Maximum number of buckets to use.  */
-  size_t tcache_max;
+  size_t tcache_bins;
   size_t tcache_max_bytes;
   /* Maximum number of chunks in each bucket.  */
   size_t tcache_count;
   /* Maximum number of chunks to remove from the unsorted list, which
-     don't match.  */
+     aren't used to prefill the cache.  */
   size_t tcache_unsorted_limit;
 #endif
 };
@@ -1790,19 +1787,12 @@ static struct malloc_par mp_ =
 #if USE_TCACHE
   ,
   .tcache_count = TCACHE_FILL_COUNT,
-  .tcache_max = TCACHE_IDX,
-  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
-  .tcache_unsorted_limit = 0 /* No limit */
+  .tcache_bins = TCACHE_MAX_BINS,
+  .tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1),
+  .tcache_unsorted_limit = 0 /* No limit.  */
 #endif
 };

-/* Non public mallopt parameters.  */
-#if USE_TCACHE
-# define M_TCACHE_COUNT  -9
-# define M_TCACHE_MAX   -10
-# define M_TCACHE_UNSORTED_LIMIT -11
-#endif
-
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;

@@ -2928,35 +2918,43 @@ mremap_chunk (mchunkptr p, size_t new_size)

 #if USE_TCACHE

-typedef struct TCacheEntry {
-  struct TCacheEntry *next;
-} TCacheEntry;
+/* We overlay this structure on the user-data portion of a chunk when
+   the chunk is stored in the per-thread cache.  */
+typedef struct tcache_entry {
+  struct tcache_entry *next;
+} tcache_entry;

 /* There is one of these for each thread, which contains the
-   per-thread cache (hence "TCache").  Keeping overall size low is
-   mildly important.  Note that COUNTS and ENTRIES are redundant, this
-   is for performance reasons.  */
-typedef struct TCache {
-  char counts[TCACHE_IDX];
-  TCacheEntry *entries[TCACHE_IDX];
-} TCache;
+   per-thread cache (hence "tcache_perthread_struct").  Keeping
+   overall size low is mildly important.  Note that COUNTS and ENTRIES
+   are redundant, this is for performance reasons.  */
+typedef struct tcache_perthread_struct {
+  char counts[TCACHE_MAX_BINS];
+  tcache_entry *entries[TCACHE_MAX_BINS];
+} tcache_perthread_struct;

 static __thread char tcache_shutting_down = 0;
-static __thread TCache *tcache = NULL;
+static __thread tcache_perthread_struct *tcache = NULL;

+/* Caller must ensure that we know tc_idx is valid and there's room
+   for more chunks.  */
 static void
 tcache_put (mchunkptr chunk, size_t tc_idx)
 {
-  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
+  tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
+  assert (tc_idx < TCACHE_MAX_BINS);
   e->next = tcache->entries[tc_idx];
   tcache->entries[tc_idx] = e;
   ++(tcache->counts[tc_idx]);
 }

+/* Caller must ensure that we know tc_idx is valid and there's
+   available chunks to remove.  */
 static void *
 tcache_get (size_t tc_idx)
 {
-  TCacheEntry *e = tcache->entries[tc_idx];
+  tcache_entry *e = tcache->entries[tc_idx];
+  assert (tc_idx < TCACHE_MAX_BINS);
   tcache->entries[tc_idx] = e->next;
   --(tcache->counts[tc_idx]);
   return (void *) e;
@@ -2966,17 +2964,17 @@ static void __attribute__ ((section ("__libc_thread_freeres_fn")))
 tcache_thread_freeres (void)
 {
   int i;
-  TCache *tcache_tmp = tcache;
+  tcache_perthread_struct *tcache_tmp = tcache;

   if (!tcache)
     return;

   tcache = NULL;

-  for (i = 0; i < TCACHE_IDX; ++i) {
+  for (i = 0; i < TCACHE_MAX_BINS; ++i) {
     while (tcache_tmp->entries[i])
       {
-        TCacheEntry *e = tcache_tmp->entries[i];
+        tcache_entry *e = tcache_tmp->entries[i];
         tcache_tmp->entries[i] = e->next;
         __libc_free (e);
       }
@@ -2993,7 +2991,7 @@ tcache_init(void)
 {
   mstate ar_ptr;
   void *victim = 0;
-  const size_t bytes = sizeof (TCache);
+  const size_t bytes = sizeof (tcache_perthread_struct);

   if (tcache_shutting_down)
     return;
@@ -3010,10 +3008,15 @@ tcache_init(void)
   if (ar_ptr != NULL)
     __libc_lock_unlock (ar_ptr->mutex);

+  /* In a low memory situation, we may not be able to allocate memory
+     - in which case, we just keep trying later.  However, we
+     typically do this very early, so either there is sufficient
+     memory, or there isn't enough memory to do non-trivial
+     allocations anyway.  */
   if (victim)
     {
-      tcache = (TCache *) victim;
-      memset (tcache, 0, sizeof (TCache));
+      tcache = (tcache_perthread_struct *) victim;
+      memset (tcache, 0, sizeof (tcache_perthread_struct));
     }
 }

@@ -3043,8 +3046,8 @@ __libc_malloc (size_t bytes)

   MAYBE_INIT_TCACHE ();

-  if (tc_idx < mp_.tcache_max
-      && tc_idx < TCACHE_IDX /* to appease gcc */
+  if (tc_idx < mp_.tcache_bins
+      && tc_idx < TCACHE_MAX_BINS /* to appease gcc */
       && tcache
       && tcache->entries[tc_idx] != NULL)
     {
@@ -3542,19 +3545,22 @@ _int_malloc (mstate av, size_t bytes)
     can try it without checking, which saves some time on this fast
     path.
   */
+#define REMOVE_FB(fb, victim, pp)			\
+  do							\
+    {							\
+      victim = pp;					\
+      if (victim == NULL)				\
+	break;						\
+    }							\
+  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
+	 != victim);					\
+
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
     {
       idx = fastbin_index (nb);
       mfastbinptr *fb = &fastbin (av, idx);
       mchunkptr pp = *fb;
-      do
-        {
-          victim = pp;
-          if (victim == NULL)
-            break;
-        }
-      while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
-             != victim);
+      REMOVE_FB (fb, victim, pp);
       if (victim != 0)
         {
           if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
@@ -3569,27 +3575,18 @@ _int_malloc (mstate av, size_t bytes)
          /* While we're here, if we see other chunks of the same size,
             stash them in the tcache.  */
          size_t tc_idx = csize2tidx (nb);
-          if (tcache && tc_idx < mp_.tcache_max)
+          if (tcache && tc_idx < mp_.tcache_bins)
            {
              mchunkptr tc_victim;
-              int found = 0;

              /* While bin not empty and tcache not full, copy chunks over.  */
              while (tcache->counts[tc_idx] < mp_.tcache_count
                     && (pp = *fb) != NULL)
                {
-                  do
-                    {
-                      tc_victim = pp;
-                      if (tc_victim == NULL)
-                        break;
-                    }
-                  while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
-                         != tc_victim);
+                  REMOVE_FB (fb, tc_victim, pp);
                  if (tc_victim != 0)
                    {
                      tcache_put (tc_victim, tc_idx);
-                      ++found;
                    }
                }
            }
@@ -3636,10 +3633,9 @@ _int_malloc (mstate av, size_t bytes)
          /* While we're here, if we see other chunks of the same size,
             stash them in the tcache.  */
          size_t tc_idx = csize2tidx (nb);
-          if (tcache && tc_idx < mp_.tcache_max)
+          if (tcache && tc_idx < mp_.tcache_bins)
            {
              mchunkptr tc_victim;
-              int found = 0;

              /* While bin not empty and tcache not full, copy chunks over.  */
              while (tcache->counts[tc_idx] < mp_.tcache_count
@@ -3655,7 +3651,6 @@ _int_malloc (mstate av, size_t bytes)
                      bck->fd = bin;

                      tcache_put (tc_victim, tc_idx);
-                      ++found;
                    }
                }
            }
@@ -3701,7 +3696,7 @@ _int_malloc (mstate av, size_t bytes)
 #if USE_TCACHE
   INTERNAL_SIZE_T tcache_nb = 0;
   size_t tc_idx = csize2tidx (nb);
-  if (tcache && tc_idx < mp_.tcache_max)
+  if (tcache && tc_idx < mp_.tcache_bins)
     tcache_nb = nb;
   int return_cached = 0;

@@ -4169,7 +4164,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
     size_t tc_idx = csize2tidx (size);

     if (tcache
-        && tc_idx < mp_.tcache_max
+        && tc_idx < mp_.tcache_bins
         && tcache->counts[tc_idx] < mp_.tcache_count)
       {
         tcache_put (p, tc_idx);
@@ -5144,11 +5139,11 @@ static inline int
 __always_inline
 do_set_tcache_max (size_t value)
 {
-  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
   if (value >= 0 && value <= MAX_TCACHE_SIZE)
     {
+      LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
       mp_.tcache_max_bytes = value;
-      mp_.tcache_max = usize2tidx (value) + 1;
+      mp_.tcache_bins = csize2tidx (request2size(value)) + 1;
     }
   return 1;
 }
@@ -5157,7 +5152,7 @@ static inline int
 __always_inline
 do_set_tcache_count (size_t value)
 {
-  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+  LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count);
   mp_.tcache_count = value;
   return 1;
 }
@@ -5166,7 +5161,7 @@ static inline int
 __always_inline
 do_set_tcache_unsorted_limit (size_t value)
 {
-  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  LIBC_PROBE (memory_tunable_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
   mp_.tcache_unsorted_limit = value;
   return 1;
 }
@@ -5231,20 +5226,6 @@ __libc_mallopt (int param_number, int value)
       if (value > 0)
         do_set_arena_test (value);
       break;
-#if USE_TCACHE
-    case M_TCACHE_COUNT:
-      if (value >= 0)
-        do_set_tcache_count (value);
-      break;
-    case M_TCACHE_MAX:
-      if (value >= 0)
-        do_set_tcache_max (value);
-      break;
-    case M_TCACHE_UNSORTED_LIMIT:
-      if (value >= 0)
-        do_set_tcache_unsorted_limit (value);
-      break;
-#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;
diff --git a/manual/probes.texi b/manual/probes.texi
index eb91c62..96acaed 100644
--- a/manual/probes.texi
+++ b/manual/probes.texi
@@ -231,6 +231,25 @@ dynamic brk/mmap thresholds.  Argument @var{$arg1} and @var{$arg2} are
 the adjusted mmap and trim thresholds, respectively.
 @end deftp

+@deftp Probe memory_tunable_tcache_max_bytes (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the @code{glibc.malloc.tcache_max}
+tunable is set.  Argument @var{$arg1} is the requested value, and
+@var{$arg2} is the previous value of this tunable.
+@end deftp
+
+@deftp Probe memory_tunable_tcache_count (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the @code{glibc.malloc.tcache_count}
+tunable is set.  Argument @var{$arg1} is the requested value, and
+@var{$arg2} is the previous value of this tunable.
+@end deftp
+
+@deftp Probe memory_tunable_tcache_unsorted_limit (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the
+@code{glibc.malloc.tcache_unsorted_limit} tunable is set.  Argument
+@var{$arg1} is the requested value, and @var{$arg2} is the previous
+value of this tunable.
+@end deftp
+
 @node Mathematical Function Probes
 @section Mathematical Function Probes
diff --git a/manual/tunables.texi b/manual/tunables.texi
index ac8c38f..b651a1d 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -190,3 +190,37 @@ number of arenas is determined by the number of CPU cores online.  For
 32-bit systems the limit is twice the number of cores online and on
 64-bit systems, it is 8 times the number of cores online.
 @end deftp
+
+@deftp Tunable glibc.malloc.tcache_max
+The maximum size of a request (in bytes) which may be met via the
+per-thread cache.  The default (and maximum) value is 1032 bytes on
+64-bit systems and 516 bytes on 32-bit systems.
+@end deftp
+
+@deftp Tunable glibc.malloc.tcache_count
+The maximum number of chunks of each size to cache.  The default is 7.
+There is no upper limit, other than available system memory.  Note
+that chunks are rounded up to malloc's guaranteed alignment - this
+count is per rounded size, not per user-provided size.
+
+The approximate maximum overhead of the per-thread cache (for each
+thread, of course) is thus @code{glibc.malloc.tcache_max} (in bins,
+max 64 bins) times @code{glibc.malloc.tcache_count} times the size for
+each bin.  With defaults, this is about 236 KB on 64-bit systems and
+118 KB on 32-bit systems.
+@end deftp
+
+@deftp Tunable glibc.malloc.tcache_unsorted_limit
+When the user requests memory and the request cannot be met via the
+per-thread cache, the arenas are used to meet the request.  At this
+time, additional chunks will be moved from existing arena lists to
+pre-fill the corresponding cache.  While copies from the fastbins,
+smallbins, and regular bins are bounded and predictable due to the bin
+sizes, copies from the unsorted bin are not bounded, and incur
+additional time penalties as they need to be sorted as they're
+scanned.  To make scanning the unsorted list more predictable and
+bounded, the user may set this tunable to limit the number of blocks
+that are scanned from the unsorted list while searching for chunks to
+pre-fill the per-thread cache with.  The default, or when set to zero,
+is no limit.
+@end deftp
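
As a cross-check of the figures quoted in the new tunables documentation above, the following standalone snippet recomputes them. It is illustrative only: the alignment, size-field, and minimum-chunk constants are assumed values for typical 64-bit and 32-bit glibc builds, not anything exported by malloc.c. It reproduces the 1032-byte and 516-byte maximum request sizes and arrives at a worst-case per-thread footprint in the same ballpark as the documented roughly 236 KB and 118 KB (64 bins, each holding up to the default tcache_count of 7 chunks).

/* Hypothetical back-of-the-envelope check of the numbers in the new
   tunables.texi text.  The per-architecture constants are assumptions
   for typical 64-bit and 32-bit glibc builds, not values read from
   malloc.c.  */
#include <stdio.h>
#include <stddef.h>

struct arch { const char *name; size_t alignment, size_sz, minsize; };

int
main (void)
{
  const struct arch archs[] = {
    { "64-bit", 16, 8, 32 },
    { "32-bit",  8, 4, 16 },
  };
  const size_t tcache_max_bins = 64;   /* TCACHE_MAX_BINS */
  const size_t tcache_count = 7;       /* default glibc.malloc.tcache_count */

  for (size_t a = 0; a < 2; a++)
    {
      const struct arch *p = &archs[a];

      /* tidx2usize (TCACHE_MAX_BINS - 1): largest request served
         from the per-thread cache.  */
      size_t max_request = (tcache_max_bins - 1) * p->alignment
                           + p->minsize - p->size_sz;

      /* Worst case: every bin full with tcache_count chunks; a chunk
         in bin i occupies minsize + i * alignment bytes of heap.  */
      size_t overhead = 0;
      for (size_t i = 0; i < tcache_max_bins; i++)
        overhead += tcache_count * (p->minsize + i * p->alignment);

      printf ("%s: max cached request %zu bytes, worst-case overhead ~%zu KB\n",
              p->name, max_request, overhead / 1024);
    }
  return 0;
}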