From 63310467a3d1ed6a0460ec1f4268126cd1ceec2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Jan 2011 11:12:26 -0600 Subject: mm: Remove support for kmem_cache_name() The last user was ext4 and Eric Sandeen removed the call in a recent patch. See the following URL for the discussion: http://marc.info/?l=linux-ext4&m=129546975702198&w=2 Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d2f343a54bad 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2399,12 +2399,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { -- cgit v1.2.3 From b3d41885d9cd0d9db31c8f49e362bae02c96fa3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2011 18:35:22 +0100 Subject: slub: fix kmemcheck calls to match ksize() hints Recent use of ksize() in network stack (commit ca44ac38 : net: don't reallocate skb->head unless the current one hasn't the needed extra size or is shared) triggers kmemcheck warnings, because ksize() can return more space than kmemcheck is aware of. Pekka Enberg noticed SLAB+kmemcheck is doing the right thing, while SLUB +kmemcheck doesnt. Bugzilla reference #27212 Reported-by: Christian Casteyde Suggested-by: Pekka Enberg Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: David Rientjes Acked-by: Christoph Lameter CC: Changli Gao CC: Andrew Morton Signed-off-by: Pekka Enberg --- mm/slub.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index d2f343a54bad..217b5b5338a2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -797,10 +797,34 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) return should_failslab(s->objsize, flags, s->flags); } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); } @@ -2690,7 +2714,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2701,28 +2724,8 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab); } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Sat, 26 Feb 2011 20:10:26 +0100 Subject: slub: fix ksize() build error mm/slub.c: In function 'ksize': mm/slub.c:2728: error: implicit declaration of function 'slab_ksize' slab_ksize() needs to go out of CONFIG_SLUB_DEBUG section. Acked-by: Randy Dunlap Acked-by: David Rientjes Signed-off-by: Mariusz Kozlowski Signed-off-by: Pekka Enberg --- mm/slub.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 217b5b5338a2..ea6f0390996f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,6 +281,30 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + static inline struct kmem_cache_order_objects oo_make(int order, unsigned long size) { @@ -797,30 +821,6 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) return should_failslab(s->objsize, flags, s->flags); } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; -- cgit v1.2.3 From d3f661d69a486db0e0e6343b452f45d91b4b3656 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Feb 2011 11:38:52 -0600 Subject: slub: Get rid of slab_free_hook_irq() The following patch will make the fastpaths lockless and will no longer require interrupts to be disabled. Calling the free hook with irq disabled will no longer be possible. Move the slab_free_hook_irq() logic into slab_free_hook. Only disable interrupts if the features are selected that require callbacks with interrupts off and reenable after calls have been made. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..bae7a5c636f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -807,14 +807,24 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->objsize); + debug_check_no_locks_freed(x, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); + local_irq_restore(flags); + } +#endif } /* @@ -1101,9 +1111,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, static inline void slab_free_hook(struct kmem_cache *s, void *x) {} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} - #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1909,8 +1916,6 @@ static __always_inline void slab_free(struct kmem_cache *s, local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); - if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); c->freelist = object; -- cgit v1.2.3 From 8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Feb 2011 11:38:54 -0600 Subject: Lockless (and preemptless) fastpaths for slub Use the this_cpu_cmpxchg_double functionality to implement a lockless allocation algorithm on arches that support fast this_cpu_ops. Each of the per cpu pointers is paired with a transaction id that ensures that updates of the per cpu information can only occur in sequence on a certain cpu. A transaction id is a "long" integer that is comprised of an event number and the cpu number. The event number is incremented for every change to the per cpu state. This means that the cmpxchg instruction can verify for an update that nothing interfered and that we are updating the percpu structure for the processor where we picked up the information and that we are also currently on that processor when we update the information. This results in a significant decrease of the overhead in the fastpaths. It also makes it easy to adopt the fast path for realtime kernels since this is lockless and does not require the use of the current per cpu area over the critical section. It is only important that the per cpu area is current at the beginning of the critical section and at the end. So there is no need even to disable preemption. Test results show that the fastpath cycle count is reduced by up to ~ 40% (alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree adds a few cycles. Sadly this does nothing for the slowpath which is where the main issues with performance in slub are but the best case performance rises significantly. (For that see the more complex slub patches that require cmpxchg_double) Kmalloc: alloc/free test Before: 10000 times kmalloc(8)/kfree -> 134 cycles 10000 times kmalloc(16)/kfree -> 152 cycles 10000 times kmalloc(32)/kfree -> 144 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 142 cycles 10000 times kmalloc(256)/kfree -> 132 cycles 10000 times kmalloc(512)/kfree -> 132 cycles 10000 times kmalloc(1024)/kfree -> 135 cycles 10000 times kmalloc(2048)/kfree -> 135 cycles 10000 times kmalloc(4096)/kfree -> 135 cycles 10000 times kmalloc(8192)/kfree -> 144 cycles 10000 times kmalloc(16384)/kfree -> 754 cycles After: 10000 times kmalloc(8)/kfree -> 78 cycles 10000 times kmalloc(16)/kfree -> 78 cycles 10000 times kmalloc(32)/kfree -> 82 cycles 10000 times kmalloc(64)/kfree -> 88 cycles 10000 times kmalloc(128)/kfree -> 79 cycles 10000 times kmalloc(256)/kfree -> 79 cycles 10000 times kmalloc(512)/kfree -> 85 cycles 10000 times kmalloc(1024)/kfree -> 82 cycles 10000 times kmalloc(2048)/kfree -> 82 cycles 10000 times kmalloc(4096)/kfree -> 85 cycles 10000 times kmalloc(8192)/kfree -> 82 cycles 10000 times kmalloc(16384)/kfree -> 706 cycles Kmalloc: Repeatedly allocate then free test Before: 10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles 10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles 10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles 10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles 10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles 10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles 10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles 10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles 10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles 10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles 10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles 10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles After: 10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles 10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles 10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles 10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles 10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles 10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles 10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles 10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles 10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles 10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles 10000 times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles 10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 5 +- mm/slub.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 207 insertions(+), 3 deletions(-) (limited to 'mm/slub.c') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 875df55ab36d..009b0020079d 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -35,7 +35,10 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { - void **freelist; /* Pointer to first free per cpu object */ + void **freelist; /* Pointer to next available object */ +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; /* Globally unique transaction id */ +#endif struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ #ifdef CONFIG_SLUB_STATS diff --git a/mm/slub.c b/mm/slub.c index bae7a5c636f4..65030c7fd7e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) } } +#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_PREEMPT +/* + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. + */ +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + printk("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + printk("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif +} + +#endif + +void init_kmem_cache_cpus(struct kmem_cache *s) +{ +#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +#endif + +} /* * Remove the cpu slab */ @@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) page->inuse--; } c->page = NULL; +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); +#endif unfreeze_slab(s, page, tail); } @@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void **object; struct page *new; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif +#endif /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; @@ -1685,6 +1772,10 @@ load_freelist: c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); + local_irq_restore(flags); +#endif stat(s, ALLOC_SLOWPATH); return object; @@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif if (slab_pre_alloc_hook(s, gfpflags)) return NULL; +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#else +redo: +#endif + + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + */ c = __this_cpu_ptr(s->cpu_slab); + +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + barrier(); +#endif + object = c->freelist; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The cmpxchg will only match if there was no additonal + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only against + * code executing on this cpu *not* from access by other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + get_freepointer(s, object), next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } +#else c->freelist = get_freepointer(s, object); +#endif stat(s, ALLOC_FASTPATH); } + +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); @@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; - stat(s, FREE_SLOWPATH); + local_irq_save(flags); +#endif slab_lock(page); + stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s)) goto debug; @@ -1872,6 +2020,9 @@ checks_ok: out_unlock: slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return; slab_empty: @@ -1883,6 +2034,9 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif slab_free_hook(s, x); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#endif + +redo: + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ c = __this_cpu_ptr(s->cpu_slab); +#ifdef CONFIG_CMPXCHG_LOCAL + tid = c->tid; + barrier(); +#endif + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); + +#ifdef CONFIG_CMPXCHG_LOCAL + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } +#else c->freelist = object; +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * Must align to double word boundary for the double cmpxchg instructions + * to work. + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); +#else + /* Regular alignment is sufficient */ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); +#endif + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; -- cgit v1.2.3 From ab9a0f196f2f4f080df54402493ea3dc31b5243e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:21:48 +0800 Subject: slub: automatically reserve bytes at the end of slab There is no "struct" for slub's slab, it shares with struct page. But struct page is very small, it is insufficient when we need to add some metadata for slab. So we add a field "reserved" to struct kmem_cache, when a slab is allocated, kmem_cache->reserved bytes are automatically reserved at the end of the slab for slab's metadata. Changed from v1: Export the reserved field via sysfs Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'mm/slub.c') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8b6e8ae5d5ca..ae0093cc5189 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -83,6 +83,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + int reserved; /* Reserved bytes at the end of slabs */ unsigned long min_partial; const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d3d17677bab5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,11 +281,16 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -617,7 +622,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +703,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -748,7 +753,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1988,13 +1993,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2008,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2021,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2039,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2058,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2311,7 +2316,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2334,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2354,7 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; if (!calculate_sizes(s, -1)) goto error; @@ -4017,6 +4023,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4303,6 +4315,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, -- cgit v1.2.3 From da9a638c6f8fc0633fa94a334f1c053f5e307177 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:00 +0800 Subject: slub,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it will pollute the page array. We reserve some some bytes for struct rcu_head when a slab is allocated in this situation. Changed from V1: use VM_BUG_ON instead BUG_ON Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index d3d17677bab5..ebba3eb19369 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1254,21 +1254,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -2356,6 +2373,9 @@ static int kmem_cache_open(struct kmem_cache *s, s->flags = kmem_cache_flags(size, flags, name, ctor); s->reserved = 0; + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); + if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { -- cgit v1.2.3 From a24c5a0ea902bcda348f086bd909cc2d6e305bf8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 Mar 2011 12:45:21 -0500 Subject: slub: Dont define useless label in the !CONFIG_CMPXCHG_LOCAL case The redo label needs #ifdeffery. Fixes the following problem introduced by commit 8a5ec0ba42c4 ("Lockless (and preemptless) fastpaths for slub"): mm/slub.c: In function 'slab_free': mm/slub.c:2124: warning: label 'redo' defined but not used Reported-by: Stephen Rothwell Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 65030c7fd7e2..f32aee37840b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2073,9 +2073,11 @@ static __always_inline void slab_free(struct kmem_cache *s, #ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); -#endif +#else redo: +#endif + /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since -- cgit v1.2.3 From 2fd66c517d5e98de2528d86e0e62f5069ff99f59 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 22 Mar 2011 13:32:53 -0500 Subject: slub: Add missing irq restore for the OOM path OOM path is missing the irq restore in the CONFIG_CMPXCHG_LOCAL case. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 7e4f835e32ab..e126cfbd3df2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1857,6 +1857,9 @@ new_slab: } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) -- cgit v1.2.3 From 4fdccdfbb4652a7bbac8adbce7449eb093775118 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 22 Mar 2011 13:35:00 -0500 Subject: slub: Add statistics for this_cmpxchg_double failures Add some statistics for debugging. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 90fbb6d87e11..45ca123e8002 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -32,6 +32,7 @@ enum stat_item { DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ ORDER_FALLBACK, /* Number of times fallback was necessary */ + CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { diff --git a/mm/slub.c b/mm/slub.c index e126cfbd3df2..93de30db95f5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache *s, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS __this_cpu_inc(s->cpu_slab->stat[si]); @@ -1597,6 +1597,7 @@ static inline void note_cmpxchg_failure(const char *n, printk("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } #endif -- cgit v1.2.3 From f9b615de4663c4b852e07257e9f967df6a0161c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 21:26:46 +0200 Subject: slub: Fix debugobjects with lockless fastpath On Thu, 24 Mar 2011, Ingo Molnar wrote: > RIP: 0010:[] [] get_next_timer_interrupt+0x119/0x260 That's a typical timer crash, but you were unable to debug it with debugobjects because commit d3f661d6 broke those. Cc: Christoph Lameter Tested-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 93de30db95f5..a6a783594ad4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -849,11 +849,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) local_irq_save(flags); kmemcheck_slab_free(s, x, s->objsize); debug_check_no_locks_freed(x, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->objsize); local_irq_restore(flags); } #endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); } /* -- cgit v1.2.3 From b8c4c96ed4cdecf5ae51fc6f4c006658e873047f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 24 Mar 2011 14:51:38 -0500 Subject: SLUB: Write to per cpu data when allocating it It turns out that the cmpxchg16b emulation has to access vmalloced percpu memory with interrupts disabled. If the memory has never been touched before then the fault necessary to establish the mapping will not to occur and the kernel will fail on boot. Fix that by reusing the CONFIG_PREEMPT code that writes the cpu number into a field on every cpu. Writing to the per cpu area before causes the mapping to be established before we get to a cmpxchg16b emulation. Tested-by: Ingo Molnar Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index a6a783594ad4..f881874843a5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1604,7 +1604,7 @@ static inline void note_cmpxchg_failure(const char *n, void init_kmem_cache_cpus(struct kmem_cache *s) { -#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) +#ifdef CONFIG_CMPXCHG_LOCAL int cpu; for_each_possible_cpu(cpu) -- cgit v1.2.3