author     Nick Piggin <npiggin@suse.de>              2009-02-03 15:07:12 +0100
committer  Pekka Enberg <penberg@cs.helsinki.fi>      2009-04-16 11:14:28 +0300
commit     13079b7881aa3b027e4babdff901765e88d7e216 (patch)
tree       bca77ccf8581bd217f85d7af6a2d248a083dbd9d /mm
parent     ed65612b974d4b3609192eadd8b3d632fe4960e7 (diff)
slqb: dynamic array allocations
Implement dynamic allocation for SLQB per-cpu and per-node arrays. This should hopefully have minimal runtime performance impact, because although there is an extra level of indirection to do allocations, the pointer should be in the cache hot area of the struct kmem_cache.

It's not quite possible to use the dynamic percpu allocator for this: firstly, that subsystem itself uses the slab allocator. Secondly, it doesn't have good support for per-node data. If those problems were improved, we could use it. For now, just implement a very simple allocator until the kmalloc caches are up.

On x86-64 with a NUMA MAXCPUS config, sizes look like this:

   text     data     bss      dec      hex  filename
  29960   259565     100   289625    46b59  mm/slab.o
  34130   497130     696   531956    81df4  mm/slub.o
  24575  1634267  111136  1769978   1b01fa  mm/slqb.o
  24845    13959     712    39516     9a5c  mm/slqb.o + this patch

SLQB is now two orders of magnitude smaller than it was, and an order of magnitude smaller than SLAB or SLUB (in total size -- text size has always been smaller). So it should now be very suitable for distro-type configs in this respect.

As a side effect, the UP version of cpu_slab (which is embedded directly in the kmem_cache struct) moves up to the hot cachelines, so it need no longer be cacheline aligned on UP. The overall result should be a reduction in cacheline footprint on UP kernels.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
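The struct kmem_cache side of this change lives in the slqb header, which is outside this mm-only diff. As a rough, assumed sketch (field names taken from the accessors used in the patch below, s->cpu_slab and s->node_slab; the real layout in the header may differ), the fixed trailing arrays become pointers to arrays sized at runtime:

	struct kmem_cache {
		/* ... hot fields: object size, queue limits, name, ctor ... */
	#ifdef CONFIG_SMP
		/* nr_cpu_ids entries, from kmem_cache_dyn_array_alloc() */
		struct kmem_cache_cpu **cpu_slab;
	#else
		/* UP: still embedded, now close to the hot cachelines */
		struct kmem_cache_cpu cpu_slab;
	#endif
	#ifdef CONFIG_NUMA
		/* nr_node_ids entries, from kmem_cache_dyn_array_alloc() */
		struct kmem_cache_node **node_slab;
	#endif
	};

Moving the MAXCPUS/MAX_NUMNODES-sized tables out of the struct and allocating them per cache at runtime is what produces the data/bss reduction shown in the size table above.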
Diffstat (limited to 'mm')
-rw-r--r--  mm/slqb.c  117
1 file changed, 88 insertions, 29 deletions
diff --git a/mm/slqb.c b/mm/slqb.c
index 7ebbf01e3593..4352dad169dc 100644
--- a/mm/slqb.c
+++ b/mm/slqb.c
@@ -56,7 +56,6 @@ static inline void struct_slqb_page_wrong_size(void)
#define PG_SLQB_BIT (1 << PG_slab)
-static int kmem_size __read_mostly;
#ifdef CONFIG_NUMA
static inline int slab_numa(struct kmem_cache *s)
{
@@ -1329,7 +1328,7 @@ static noinline void *__slab_alloc_page(struct kmem_cache *s,
#ifdef CONFIG_NUMA
struct kmem_cache_node *n;
- n = s->node[slqb_page_to_nid(page)];
+ n = s->node_slab[slqb_page_to_nid(page)];
l = &n->list;
page->list = l;
@@ -1373,7 +1372,7 @@ static void *__remote_slab_alloc_node(struct kmem_cache *s,
struct kmem_cache_list *l;
void *object;
- n = s->node[node];
+ n = s->node_slab[node];
if (unlikely(!n)) /* node has no memory */
return NULL;
l = &n->list;
@@ -1818,7 +1817,7 @@ static void init_kmem_cache_node(struct kmem_cache *s,
}
#endif
-/* Initial slabs. XXX: allocate dynamically (with bootmem maybe) */
+/* Initial slabs. */
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
#endif
@@ -1912,10 +1911,10 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n;
- n = s->node[node];
+ n = s->node_slab[node];
if (n) {
kmem_cache_free(&kmem_node_cache, n);
- s->node[node] = NULL;
+ s->node_slab[node] = NULL;
}
}
}
@@ -1933,7 +1932,7 @@ static int alloc_kmem_cache_nodes(struct kmem_cache *s)
return 0;
}
init_kmem_cache_node(s, n);
- s->node[node] = n;
+ s->node_slab[node] = n;
}
return 1;
}
@@ -2069,13 +2068,56 @@ static int calculate_sizes(struct kmem_cache *s)
}
+#ifdef CONFIG_SMP
+/*
+ * Per-cpu allocator can't be used because it always uses slab allocator,
+ * and it can't do per-node allocations.
+ */
+static void *kmem_cache_dyn_array_alloc(int ids)
+{
+ size_t size = sizeof(void *) * ids;
+
+ if (unlikely(!slab_is_available())) {
+ static void *nextmem;
+ void *ret;
+
+ /*
+ * Special case for setting up initial caches. These will
+ * never get freed by definition so we can do it rather
+ * simply.
+ */
+ if (!nextmem) {
+ nextmem = alloc_pages_exact(size, GFP_KERNEL);
+ if (!nextmem)
+ return NULL;
+ }
+ ret = nextmem;
+ nextmem = (void *)((unsigned long)ret + size);
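+ /*
+  * If this bump crossed off the page the allocation started on,
+  * discard the remainder; the next early allocation grabs fresh pages.
+  */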
+ if ((unsigned long)ret >> PAGE_SHIFT !=
+ (unsigned long)nextmem >> PAGE_SHIFT)
+ nextmem = NULL;
+ memset(ret, 0, size);
+ return ret;
+ } else {
+ return kzalloc(size, GFP_KERNEL);
+ }
+}
+
+static void kmem_cache_dyn_array_free(void *array)
+{
+ if (unlikely(!slab_is_available()))
+ return; /* error case without crashing here (will panic soon) */
+ kfree(array);
+}
+#endif
+
static int kmem_cache_open(struct kmem_cache *s,
const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *), int alloc)
{
unsigned int left_over;
- memset(s, 0, kmem_size);
+ memset(s, 0, sizeof(struct kmem_cache));
s->name = name;
s->ctor = ctor;
s->objsize = size;
@@ -2094,10 +2136,26 @@ static int kmem_cache_open(struct kmem_cache *s,
s->colour_range = 0;
}
+ /*
+ * Protect all alloc_kmem_cache_cpus/nodes allocations with slqb_lock
+ * to lock out hotplug, just in case (probably not strictly needed
+ * here).
+ */
down_write(&slqb_lock);
+#ifdef CONFIG_SMP
+ s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
+ if (!s->cpu_slab)
+ goto error_lock;
+# ifdef CONFIG_NUMA
+ s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
+ if (!s->node_slab)
+ goto error_cpu_array;
+# endif
+#endif
+
if (likely(alloc)) {
if (!alloc_kmem_cache_nodes(s))
- goto error_lock;
+ goto error_node_array;
if (!alloc_kmem_cache_cpus(s))
goto error_nodes;
@@ -2111,6 +2169,14 @@ static int kmem_cache_open(struct kmem_cache *s,
error_nodes:
free_kmem_cache_nodes(s);
+error_node_array:
+#ifdef CONFIG_NUMA
+ kmem_cache_dyn_array_free(s->node_slab);
+#endif
+error_cpu_array:
+#ifdef CONFIG_SMP
+ kmem_cache_dyn_array_free(s->cpu_slab);
+#endif
error_lock:
up_write(&slqb_lock);
error:
@@ -2152,7 +2218,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
page = virt_to_head_slqb_page(ptr);
if (unlikely(!(page->flags & PG_SLQB_BIT)))
goto out;
- if (unlikely(page->list->cache != s))
+ if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
goto out;
return 1;
out:
@@ -2220,7 +2286,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
struct kmem_cache_node *n;
struct kmem_cache_list *l;
- n = s->node[node];
+ n = s->node_slab[node];
if (!n)
continue;
l = &n->list;
@@ -2449,7 +2515,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
struct kmem_cache_node *n;
struct kmem_cache_list *l;
- n = s->node[node];
+ n = s->node_slab[node];
if (!n)
continue;
l = &n->list;
@@ -2502,7 +2568,7 @@ static void kmem_cache_reap(void)
struct kmem_cache_node *n;
struct kmem_cache_list *l;
- n = s->node[node];
+ n = s->node_slab[node];
if (!n)
continue;
l = &n->list;
@@ -2529,7 +2595,7 @@ static void cache_trim_worker(struct work_struct *w)
list_for_each_entry(s, &slab_caches, list) {
#ifdef CONFIG_NUMA
int node = numa_node_id();
- struct kmem_cache_node *n = s->node[node];
+ struct kmem_cache_node *n = s->node_slab[node];
if (n) {
struct kmem_cache_list *l = &n->list;
@@ -2618,7 +2684,7 @@ static int slab_mem_going_online_callback(void *arg)
* since memory is not yet available from the node that
* is brought up.
*/
- if (s->node[nid]) /* could be leftover from last online */
+ if (s->node_slab[nid]) /* could be leftover from last online */
continue;
n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
if (!n) {
@@ -2626,7 +2692,7 @@ static int slab_mem_going_online_callback(void *arg)
goto out;
}
init_kmem_cache_node(s, n);
- s->node[nid] = n;
+ s->node_slab[nid] = n;
}
out:
up_write(&slqb_lock);
@@ -2673,15 +2739,8 @@ void __init kmem_cache_init(void)
* All the ifdefs are rather ugly here, but it's just the setup code,
* so it doesn't have to be too readable :)
*/
-#ifdef CONFIG_SMP
- kmem_size = offsetof(struct kmem_cache, cpu_slab) +
- nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
-#else
- kmem_size = sizeof(struct kmem_cache);
-#endif
-
kmem_cache_open(&kmem_cache_cache, "kmem_cache",
- kmem_size, 0, flags, NULL, 0);
+ sizeof(struct kmem_cache), 0, flags, NULL, 0);
#ifdef CONFIG_SMP
kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
@@ -2719,15 +2778,15 @@ void __init kmem_cache_init(void)
n = &per_cpu(kmem_cache_nodes, i);
init_kmem_cache_node(&kmem_cache_cache, n);
- kmem_cache_cache.node[i] = n;
+ kmem_cache_cache.node_slab[i] = n;
n = &per_cpu(kmem_cpu_nodes, i);
init_kmem_cache_node(&kmem_cpu_cache, n);
- kmem_cpu_cache.node[i] = n;
+ kmem_cpu_cache.node_slab[i] = n;
n = &per_cpu(kmem_node_nodes, i);
init_kmem_cache_node(&kmem_node_cache, n);
- kmem_node_cache.node[i] = n;
+ kmem_node_cache.node_slab[i] = n;
}
#endif
@@ -2793,7 +2852,7 @@ void __init kmem_cache_init(void)
#endif
/*
* smp_init() has not yet been called, so no worries about memory
- * ordering here (eg. slab_is_available vs numa_platform)
+ * ordering with __slab_is_available.
*/
__slab_is_available = 1;
}
@@ -3036,7 +3095,7 @@ static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
#ifdef CONFIG_NUMA
for_each_online_node(node) {
- struct kmem_cache_node *n = s->node[node];
+ struct kmem_cache_node *n = s->node_slab[node];
struct kmem_cache_list *l = &n->list;
struct slqb_page *page;
unsigned long flags;