From 63310467a3d1ed6a0460ec1f4268126cd1ceec2e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 20 Jan 2011 11:12:26 -0600
Subject: mm: Remove support for kmem_cache_name()

The last user was ext4 and Eric Sandeen removed the call in a recent patch. See
the following URL for the discussion:

http://marc.info/?l=linux-ext4&m=129546975702198&w=2

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c9..d2f343a54bad 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2399,12 +2399,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
 }
 EXPORT_SYMBOL(kmem_cache_size);
 
-const char *kmem_cache_name(struct kmem_cache *s)
-{
-	return s->name;
-}
-EXPORT_SYMBOL(kmem_cache_name);
-
 static void list_slab_objects(struct kmem_cache *s, struct page *page,
 							const char *text)
 {
-- 
cgit v1.2.3


From b3d41885d9cd0d9db31c8f49e362bae02c96fa3f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 14 Feb 2011 18:35:22 +0100
Subject: slub: fix kmemcheck calls to match ksize() hints

Recent use of ksize() in network stack (commit ca44ac38 : net: don't
reallocate skb->head unless the current one hasn't the needed extra size
or is shared) triggers kmemcheck warnings, because ksize() can return
more space than kmemcheck is aware of.

Pekka Enberg noticed SLAB+kmemcheck is doing the right thing, while SLUB
+kmemcheck doesnt.

Bugzilla reference #27212

Reported-by: Christian Casteyde <casteyde.christian@free.fr>
Suggested-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
CC: Changli Gao <xiaosuo@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index d2f343a54bad..217b5b5338a2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -797,10 +797,34 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 	return should_failslab(s->objsize, flags, s->flags);
 }
 
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	/*
+	 * Debugging requires use of the padding between object
+	 * and whatever may come after it.
+	 */
+	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+		return s->objsize;
+
+#endif
+	/*
+	 * If we have the need to store the freelist pointer
+	 * back there or track user information then we can
+	 * only use the space before that information.
+	 */
+	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+		return s->inuse;
+	/*
+	 * Else we can use all the padding etc for the allocation
+	 */
+	return s->size;
+}
+
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
 {
 	flags &= gfp_allowed_mask;
-	kmemcheck_slab_alloc(s, flags, object, s->objsize);
+	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 	kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
 }
 
@@ -2690,7 +2714,6 @@ EXPORT_SYMBOL(__kmalloc_node);
 size_t ksize(const void *object)
 {
 	struct page *page;
-	struct kmem_cache *s;
 
 	if (unlikely(object == ZERO_SIZE_PTR))
 		return 0;
@@ -2701,28 +2724,8 @@ size_t ksize(const void *object)
 		WARN_ON(!PageCompound(page));
 		return PAGE_SIZE << compound_order(page);
 	}
-	s = page->slab;
 
-#ifdef CONFIG_SLUB_DEBUG
-	/*
-	 * Debugging requires use of the padding between object
-	 * and whatever may come after it.
-	 */
-	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
-		return s->objsize;
-
-#endif
-	/*
-	 * If we have the need to store the freelist pointer
-	 * back there or track user information then we can
-	 * only use the space before that information.
-	 */
-	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
-		return s->inuse;
-	/*
-	 * Else we can use all the padding etc for the allocation
-	 */
-	return s->size;
+	return slab_ksize(page->slab);
 }
 EXPORT_SYMBOL(ksize);
 
-- 
cgit v1.2.3


From d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 Mon Sep 17 00:00:00 2001
From: Mariusz Kozlowski <mk@lab.zgora.pl>
Date: Sat, 26 Feb 2011 20:10:26 +0100
Subject: slub: fix ksize() build error

mm/slub.c: In function 'ksize':
mm/slub.c:2728: error: implicit declaration of function 'slab_ksize'

slab_ksize() needs to go out of CONFIG_SLUB_DEBUG section.

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Mariusz Kozlowski <mk@lab.zgora.pl>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index 217b5b5338a2..ea6f0390996f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -281,6 +281,30 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 	return (p - addr) / s->size;
 }
 
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	/*
+	 * Debugging requires use of the padding between object
+	 * and whatever may come after it.
+	 */
+	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+		return s->objsize;
+
+#endif
+	/*
+	 * If we have the need to store the freelist pointer
+	 * back there or track user information then we can
+	 * only use the space before that information.
+	 */
+	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+		return s->inuse;
+	/*
+	 * Else we can use all the padding etc for the allocation
+	 */
+	return s->size;
+}
+
 static inline struct kmem_cache_order_objects oo_make(int order,
 						unsigned long size)
 {
@@ -797,30 +821,6 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 	return should_failslab(s->objsize, flags, s->flags);
 }
 
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
-	/*
-	 * Debugging requires use of the padding between object
-	 * and whatever may come after it.
-	 */
-	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
-		return s->objsize;
-
-#endif
-	/*
-	 * If we have the need to store the freelist pointer
-	 * back there or track user information then we can
-	 * only use the space before that information.
-	 */
-	if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
-		return s->inuse;
-	/*
-	 * Else we can use all the padding etc for the allocation
-	 */
-	return s->size;
-}
-
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
 {
 	flags &= gfp_allowed_mask;
-- 
cgit v1.2.3


From d3f661d69a486db0e0e6343b452f45d91b4b3656 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Fri, 25 Feb 2011 11:38:52 -0600
Subject: slub: Get rid of slab_free_hook_irq()

The following patch will make the fastpaths lockless and will no longer
require interrupts to be disabled. Calling the free hook with irq disabled
will no longer be possible.

Move the slab_free_hook_irq() logic into slab_free_hook. Only disable
interrupts if the features are selected that require callbacks with
interrupts off and reenable after calls have been made.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c9..bae7a5c636f4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -807,14 +807,24 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
 {
 	kmemleak_free_recursive(x, s->flags);
-}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
-	kmemcheck_slab_free(s, object, s->objsize);
-	debug_check_no_locks_freed(object, s->objsize);
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(object, s->objsize);
+	/*
+	 * Trouble is that we may no longer disable interupts in the fast path
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->objsize);
+		debug_check_no_locks_freed(x, s->objsize);
+		if (!(s->flags & SLAB_DEBUG_OBJECTS))
+			debug_check_no_obj_freed(x, s->objsize);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -1101,9 +1111,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s,
-		void *object) {}
-
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
@@ -1909,8 +1916,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
 	local_irq_save(flags);
 	c = __this_cpu_ptr(s->cpu_slab);
 
-	slab_free_hook_irq(s, x);
-
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
 		c->freelist = object;
-- 
cgit v1.2.3


From 8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Fri, 25 Feb 2011 11:38:54 -0600
Subject: Lockless (and preemptless) fastpaths for slub

Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.

A transaction id is a "long" integer that is comprised of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. This means that the cmpxchg instruction can verify for an
update that nothing interfered and that we are updating the percpu structure
for the processor where we picked up the information and that we are also
currently on that processor when we update the information.

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels since this
is lockless and does not require the use of the current per cpu area
over the critical section. It is only important that the per cpu area is
current at the beginning of the critical section and at the end.

So there is no need even to disable preemption.

Test results show that the fastpath cycle count is reduced by up to ~ 40%
(alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree
adds a few cycles.

Sadly this does nothing for the slowpath which is where the main issues with
performance in slub are but the best case performance rises significantly.
(For that see the more complex slub patches that require cmpxchg_double)

Kmalloc: alloc/free test

Before:

10000 times kmalloc(8)/kfree -> 134 cycles
10000 times kmalloc(16)/kfree -> 152 cycles
10000 times kmalloc(32)/kfree -> 144 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 142 cycles
10000 times kmalloc(256)/kfree -> 132 cycles
10000 times kmalloc(512)/kfree -> 132 cycles
10000 times kmalloc(1024)/kfree -> 135 cycles
10000 times kmalloc(2048)/kfree -> 135 cycles
10000 times kmalloc(4096)/kfree -> 135 cycles
10000 times kmalloc(8192)/kfree -> 144 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

After:

10000 times kmalloc(8)/kfree -> 78 cycles
10000 times kmalloc(16)/kfree -> 78 cycles
10000 times kmalloc(32)/kfree -> 82 cycles
10000 times kmalloc(64)/kfree -> 88 cycles
10000 times kmalloc(128)/kfree -> 79 cycles
10000 times kmalloc(256)/kfree -> 79 cycles
10000 times kmalloc(512)/kfree -> 85 cycles
10000 times kmalloc(1024)/kfree -> 82 cycles
10000 times kmalloc(2048)/kfree -> 82 cycles
10000 times kmalloc(4096)/kfree -> 85 cycles
10000 times kmalloc(8192)/kfree -> 82 cycles
10000 times kmalloc(16384)/kfree -> 706 cycles

Kmalloc: Repeatedly allocate then free test

Before:

10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles
10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles
10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles
10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles
10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles
10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles
10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles
10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles
10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles
10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles
10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles
10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles

After:

10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles
10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles
10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles
10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles
10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles
10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles
10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles
10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles
10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles
10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles
10000 times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles
10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slub_def.h |   5 +-
 mm/slub.c                | 205 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 207 insertions(+), 3 deletions(-)

(limited to 'mm/slub.c')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 875df55ab36d..009b0020079d 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -35,7 +35,10 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;	/* Pointer to next available object */
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;	/* Globally unique transaction id */
+#endif
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS
diff --git a/mm/slub.c b/mm/slub.c
index bae7a5c636f4..65030c7fd7e2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
 	}
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguiation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		printk("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		printk("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 		page->inuse--;
 	}
 	c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+#endif
 	unfreeze_slab(s, page, tail);
 }
 
@@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void **object;
 	struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1685,6 +1772,10 @@ load_freelist:
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+	local_irq_restore(flags);
+#endif
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#else
+redo:
+#endif
+
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+#endif
+
 	object = c->freelist;
 	if (unlikely(!object || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+		/*
+		 * The cmpxchg will only match if there was no additonal
+		 * operation and if we are on the right processor.
+		 *
+		 * The cmpxchg does the following atomically (without lock semantics!)
+		 * 1. Relocate first pointer to the current per cpu area.
+		 * 2. Verify that tid and freelist have not been changed
+		 * 3. If they were not changed replace tid and freelist
+		 *
+		 * Since this is without lock semantics the protection is only against
+		 * code executing on this cpu *not* from access by other cpus.
+		 */
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_alloc", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = get_freepointer(s, object);
+#endif
 		stat(s, ALLOC_FASTPATH);
 	}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 {
 	void *prior;
 	void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
 
-	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
+	stat(s, FREE_SLOWPATH);
 
 	if (kmem_cache_debug(s))
 		goto debug;
@@ -1872,6 +2020,9 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1883,6 +2034,9 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s,
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+	/*
+	 * Determine the currently cpus per cpu slab.
+	 * The cpu may change afterward. However that does not matter since
+	 * data is retrieved via this pointer. If we are on the same cpu
+	 * during the cmpxchg then the free will succedd.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	tid = c->tid;
+	barrier();
+#endif
+
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_free", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = object;
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * Must align to double word boundary for the double cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+	/* Regular alignment is sufficient */
 	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+	if (!s->cpu_slab)
+		return 0;
+
+	init_kmem_cache_cpus(s);
 
-	return s->cpu_slab != NULL;
+	return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;
-- 
cgit v1.2.3


From ab9a0f196f2f4f080df54402493ea3dc31b5243e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 10 Mar 2011 15:21:48 +0800
Subject: slub: automatically reserve bytes at the end of slab

There is no "struct" for slub's slab, it shares with struct page.
But struct page is very small, it is insufficient when we need
to add some metadata for slab.

So we add a field "reserved" to struct kmem_cache, when a slab
is allocated, kmem_cache->reserved bytes are automatically reserved
at the end of the slab for slab's metadata.

Changed from v1:
	Export the reserved field via sysfs

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slub_def.h |  1 +
 mm/slub.c                | 47 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'mm/slub.c')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 8b6e8ae5d5ca..ae0093cc5189 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -83,6 +83,7 @@ struct kmem_cache {
 	void (*ctor)(void *);
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
+	int reserved;		/* Reserved bytes at the end of slabs */
 	unsigned long min_partial;
 	const char *name;	/* Name (only for display!) */
 	struct list_head list;	/* List of slab caches */
diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c9..d3d17677bab5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -281,11 +281,16 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
 	return (p - addr) / s->size;
 }
 
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+	return ((PAGE_SIZE << order) - reserved) / size;
+}
+
 static inline struct kmem_cache_order_objects oo_make(int order,
-						unsigned long size)
+		unsigned long size, int reserved)
 {
 	struct kmem_cache_order_objects x = {
-		(order << OO_SHIFT) + (PAGE_SIZE << order) / size
+		(order << OO_SHIFT) + order_objects(order, size, reserved)
 	};
 
 	return x;
@@ -617,7 +622,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 		return 1;
 
 	start = page_address(page);
-	length = (PAGE_SIZE << compound_order(page));
+	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 	end = start + length;
 	remainder = length % s->size;
 	if (!remainder)
@@ -698,7 +703,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
 		return 0;
 	}
 
-	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+	maxobj = order_objects(compound_order(page), s->size, s->reserved);
 	if (page->objects > maxobj) {
 		slab_err(s, page, "objects %u > max %u",
 			s->name, page->objects, maxobj);
@@ -748,7 +753,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 		nr++;
 	}
 
-	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+	max_objects = order_objects(compound_order(page), s->size, s->reserved);
 	if (max_objects > MAX_OBJS_PER_PAGE)
 		max_objects = MAX_OBJS_PER_PAGE;
 
@@ -1988,13 +1993,13 @@ static int slub_nomerge;
  * the smallest order which will fit the object.
  */
 static inline int slab_order(int size, int min_objects,
-				int max_order, int fract_leftover)
+				int max_order, int fract_leftover, int reserved)
 {
 	int order;
 	int rem;
 	int min_order = slub_min_order;
 
-	if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
 		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 
 	for (order = max(min_order,
@@ -2003,10 +2008,10 @@ static inline int slab_order(int size, int min_objects,
 
 		unsigned long slab_size = PAGE_SIZE << order;
 
-		if (slab_size < min_objects * size)
+		if (slab_size < min_objects * size + reserved)
 			continue;
 
-		rem = slab_size % size;
+		rem = (slab_size - reserved) % size;
 
 		if (rem <= slab_size / fract_leftover)
 			break;
@@ -2016,7 +2021,7 @@ static inline int slab_order(int size, int min_objects,
 	return order;
 }
 
-static inline int calculate_order(int size)
+static inline int calculate_order(int size, int reserved)
 {
 	int order;
 	int min_objects;
@@ -2034,14 +2039,14 @@ static inline int calculate_order(int size)
 	min_objects = slub_min_objects;
 	if (!min_objects)
 		min_objects = 4 * (fls(nr_cpu_ids) + 1);
-	max_objects = (PAGE_SIZE << slub_max_order)/size;
+	max_objects = order_objects(slub_max_order, size, reserved);
 	min_objects = min(min_objects, max_objects);
 
 	while (min_objects > 1) {
 		fraction = 16;
 		while (fraction >= 4) {
 			order = slab_order(size, min_objects,
-						slub_max_order, fraction);
+					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
 				return order;
 			fraction /= 2;
@@ -2053,14 +2058,14 @@ static inline int calculate_order(int size)
 	 * We were unable to place multiple objects in a slab. Now
 	 * lets see if we can place a single object there.
 	 */
-	order = slab_order(size, 1, slub_max_order, 1);
+	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
 		return order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = slab_order(size, 1, MAX_ORDER, 1);
+	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
 	if (order < MAX_ORDER)
 		return order;
 	return -ENOSYS;
@@ -2311,7 +2316,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size);
+		order = calculate_order(size, s->reserved);
 
 	if (order < 0)
 		return 0;
@@ -2329,8 +2334,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	/*
 	 * Determine the number of objects per slab
 	 */
-	s->oo = oo_make(order, size);
-	s->min = oo_make(get_order(size), size);
+	s->oo = oo_make(order, size, s->reserved);
+	s->min = oo_make(get_order(size), size, s->reserved);
 	if (oo_objects(s->oo) > oo_objects(s->max))
 		s->max = s->oo;
 
@@ -2349,6 +2354,7 @@ static int kmem_cache_open(struct kmem_cache *s,
 	s->objsize = size;
 	s->align = align;
 	s->flags = kmem_cache_flags(size, flags, name, ctor);
+	s->reserved = 0;
 
 	if (!calculate_sizes(s, -1))
 		goto error;
@@ -4017,6 +4023,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
 }
 SLAB_ATTR_RO(destroy_by_rcu);
 
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
 #ifdef CONFIG_SLUB_DEBUG
 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 {
@@ -4303,6 +4315,7 @@ static struct attribute *slab_attrs[] = {
 	&reclaim_account_attr.attr,
 	&destroy_by_rcu_attr.attr,
 	&shrink_attr.attr,
+	&reserved_attr.attr,
 #ifdef CONFIG_SLUB_DEBUG
 	&total_objects_attr.attr,
 	&slabs_attr.attr,
-- 
cgit v1.2.3


From da9a638c6f8fc0633fa94a334f1c053f5e307177 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 10 Mar 2011 15:22:00 +0800
Subject: slub,rcu: don't assume the size of struct rcu_head

The size of struct rcu_head may be changed. When it becomes larger,
it will pollute the page array.

We reserve some some bytes for struct rcu_head when a slab
is allocated in this situation.

Changed from V1:
	use VM_BUG_ON instead BUG_ON

Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index d3d17677bab5..ebba3eb19369 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1254,21 +1254,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__free_pages(page, order);
 }
 
+#define need_reserve_slab_rcu						\
+	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
 static void rcu_free_slab(struct rcu_head *h)
 {
 	struct page *page;
 
-	page = container_of((struct list_head *)h, struct page, lru);
+	if (need_reserve_slab_rcu)
+		page = virt_to_head_page(h);
+	else
+		page = container_of((struct list_head *)h, struct page, lru);
+
 	__free_slab(page->slab, page);
 }
 
 static void free_slab(struct kmem_cache *s, struct page *page)
 {
 	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
-		/*
-		 * RCU free overloads the RCU head over the LRU
-		 */
-		struct rcu_head *head = (void *)&page->lru;
+		struct rcu_head *head;
+
+		if (need_reserve_slab_rcu) {
+			int order = compound_order(page);
+			int offset = (PAGE_SIZE << order) - s->reserved;
+
+			VM_BUG_ON(s->reserved != sizeof(*head));
+			head = page_address(page) + offset;
+		} else {
+			/*
+			 * RCU free overloads the RCU head over the LRU
+			 */
+			head = (void *)&page->lru;
+		}
 
 		call_rcu(head, rcu_free_slab);
 	} else
@@ -2356,6 +2373,9 @@ static int kmem_cache_open(struct kmem_cache *s,
 	s->flags = kmem_cache_flags(size, flags, name, ctor);
 	s->reserved = 0;
 
+	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+		s->reserved = sizeof(struct rcu_head);
+
 	if (!calculate_sizes(s, -1))
 		goto error;
 	if (disable_higher_order_debug) {
-- 
cgit v1.2.3


From a24c5a0ea902bcda348f086bd909cc2d6e305bf8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 15 Mar 2011 12:45:21 -0500
Subject: slub: Dont define useless label in the !CONFIG_CMPXCHG_LOCAL case

The redo label needs #ifdeffery. Fixes the following problem introduced by
commit 8a5ec0ba42c4 ("Lockless (and preemptless) fastpaths for slub"):

  mm/slub.c: In function 'slab_free':
  mm/slub.c:2124: warning: label 'redo' defined but not used

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index 65030c7fd7e2..f32aee37840b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2073,9 +2073,11 @@ static __always_inline void slab_free(struct kmem_cache *s,
 
 #ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
-#endif
 
+#else
 redo:
+#endif
+
 	/*
 	 * Determine the currently cpus per cpu slab.
 	 * The cpu may change afterward. However that does not matter since
-- 
cgit v1.2.3


From 2fd66c517d5e98de2528d86e0e62f5069ff99f59 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 22 Mar 2011 13:32:53 -0500
Subject: slub: Add missing irq restore for the OOM path

OOM path is missing the irq restore in the CONFIG_CMPXCHG_LOCAL case.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index 7e4f835e32ab..e126cfbd3df2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1857,6 +1857,9 @@ new_slab:
 	}
 	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
 		slab_out_of_memory(s, gfpflags, node);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return NULL;
 debug:
 	if (!alloc_debug_processing(s, c->page, object, addr))
-- 
cgit v1.2.3


From 4fdccdfbb4652a7bbac8adbce7449eb093775118 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Tue, 22 Mar 2011 13:35:00 -0500
Subject: slub: Add statistics for this_cmpxchg_double failures

Add some statistics for debugging.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slub_def.h | 1 +
 mm/slub.c                | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/slub.c')

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 90fbb6d87e11..45ca123e8002 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -32,6 +32,7 @@ enum stat_item {
 	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
 	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
 	ORDER_FALLBACK,		/* Number of times fallback was necessary */
+	CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
diff --git a/mm/slub.c b/mm/slub.c
index e126cfbd3df2..93de30db95f5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 
 #endif
 
-static inline void stat(struct kmem_cache *s, enum stat_item si)
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
 #ifdef CONFIG_SLUB_STATS
 	__this_cpu_inc(s->cpu_slab->stat[si]);
@@ -1597,6 +1597,7 @@ static inline void note_cmpxchg_failure(const char *n,
 		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
 			actual_tid, tid, next_tid(tid));
 #endif
+	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
 }
 
 #endif
-- 
cgit v1.2.3


From f9b615de4663c4b852e07257e9f967df6a0161c0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 24 Mar 2011 21:26:46 +0200
Subject: slub: Fix debugobjects with lockless fastpath

On Thu, 24 Mar 2011, Ingo Molnar wrote:
> RIP: 0010:[<ffffffff810570a9>]  [<ffffffff810570a9>] get_next_timer_interrupt+0x119/0x260

That's a typical timer crash, but you were unable to debug it with
debugobjects because commit d3f661d6 broke those.

Cc: Christoph Lameter <cl@linux.com>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index 93de30db95f5..a6a783594ad4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -849,11 +849,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
 		local_irq_save(flags);
 		kmemcheck_slab_free(s, x, s->objsize);
 		debug_check_no_locks_freed(x, s->objsize);
-		if (!(s->flags & SLAB_DEBUG_OBJECTS))
-			debug_check_no_obj_freed(x, s->objsize);
 		local_irq_restore(flags);
 	}
 #endif
+	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(x, s->objsize);
 }
 
 /*
-- 
cgit v1.2.3


From b8c4c96ed4cdecf5ae51fc6f4c006658e873047f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 24 Mar 2011 14:51:38 -0500
Subject: SLUB: Write to per cpu data when allocating it

It turns out that the cmpxchg16b emulation has to access vmalloced
percpu memory with interrupts disabled. If the memory has never
been touched before then the fault necessary to establish the
mapping will not to occur and the kernel will fail on boot.

Fix that by reusing the CONFIG_PREEMPT code that writes the
cpu number into a field on every cpu. Writing to the per cpu
area before causes the mapping to be established before we get
to a cmpxchg16b emulation.

Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/slub.c')

diff --git a/mm/slub.c b/mm/slub.c
index a6a783594ad4..f881874843a5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1604,7 +1604,7 @@ static inline void note_cmpxchg_failure(const char *n,
 
 void init_kmem_cache_cpus(struct kmem_cache *s)
 {
-#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+#ifdef CONFIG_CMPXCHG_LOCAL
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-- 
cgit v1.2.3