summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2008-12-04 16:33:25 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2008-12-04 16:33:25 +1100
commit66c3b54aaf127ee1a99c7a8e3a907f4e85715a9b (patch)
tree619a4d458c85af177f11cdf557256717d1f35730 /mm
parent3b22d14aeb9e110f956637a7655d2cbfe7d5da79 (diff)
parent00f39605364d5d04d669c7c280d8645efee3aa51 (diff)
Merge commit 'slab/for-next'
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile1
-rw-r--r--mm/kmemtrace.c333
-rw-r--r--mm/slab.c85
-rw-r--r--mm/slob.c37
-rw-r--r--mm/slub.c543
-rw-r--r--mm/vmscan.c64
-rw-r--r--mm/vmstat.c4
7 files changed, 937 insertions, 130 deletions
diff --git a/mm/Makefile b/mm/Makefile
index c06b45a1ff5f..3782eb66d4b3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c
new file mode 100644
index 000000000000..2a70a805027c
--- /dev/null
+++ b/mm/kmemtrace.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#include <linux/string.h>
+#include <linux/debugfs.h>
+#include <linux/relay.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/gfp.h>
+#include <linux/kmemtrace.h>
+
+#define KMEMTRACE_SUBBUF_SIZE 524288
+#define KMEMTRACE_DEF_N_SUBBUFS 20
+
+static struct rchan *kmemtrace_chan;
+static u32 kmemtrace_buf_overruns;
+
+static unsigned int kmemtrace_n_subbufs;
+
+/* disabled by default */
+static unsigned int kmemtrace_enabled;
+
+/*
+ * The sequence number is used for reordering kmemtrace packets
+ * in userspace, since they are logged as per-CPU data.
+ *
+ * atomic_t should always be a 32-bit signed integer. Wraparound is not
+ * likely to occur, but userspace can deal with it by expecting a certain
+ * sequence number in the next packet that will be read.
+ */
+static atomic_t kmemtrace_seq_num;
+
+#define KMEMTRACE_ABI_VERSION 1
+
+static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION;
+
+enum kmemtrace_event_id {
+ KMEMTRACE_EVENT_ALLOC = 0,
+ KMEMTRACE_EVENT_FREE,
+};
+
+struct kmemtrace_event {
+ u8 event_id;
+ u8 type_id;
+ u16 event_size;
+ s32 seq_num;
+ u64 call_site;
+ u64 ptr;
+} __attribute__ ((__packed__));
+
+struct kmemtrace_stats_alloc {
+ u64 bytes_req;
+ u64 bytes_alloc;
+ u32 gfp_flags;
+ s32 numa_node;
+} __attribute__ ((__packed__));
+
+static void kmemtrace_probe_alloc(void *probe_data, void *call_data,
+ const char *format, va_list *args)
+{
+ unsigned long flags;
+ struct kmemtrace_event *ev;
+ struct kmemtrace_stats_alloc *stats;
+ void *buf;
+
+ local_irq_save(flags);
+
+ buf = relay_reserve(kmemtrace_chan,
+ sizeof(struct kmemtrace_event) +
+ sizeof(struct kmemtrace_stats_alloc));
+ if (!buf)
+ goto failed;
+
+ /*
+ * Don't convert this to use structure initializers,
+ * C99 does not guarantee the rvalues evaluation order.
+ */
+
+ ev = buf;
+ ev->event_id = KMEMTRACE_EVENT_ALLOC;
+ ev->type_id = va_arg(*args, int);
+ ev->event_size = sizeof(struct kmemtrace_event) +
+ sizeof(struct kmemtrace_stats_alloc);
+ ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num);
+ ev->call_site = va_arg(*args, unsigned long);
+ ev->ptr = va_arg(*args, unsigned long);
+
+ stats = buf + sizeof(struct kmemtrace_event);
+ stats->bytes_req = va_arg(*args, unsigned long);
+ stats->bytes_alloc = va_arg(*args, unsigned long);
+ stats->gfp_flags = va_arg(*args, unsigned long);
+ stats->numa_node = va_arg(*args, int);
+
+failed:
+ local_irq_restore(flags);
+}
+
+static void kmemtrace_probe_free(void *probe_data, void *call_data,
+ const char *format, va_list *args)
+{
+ unsigned long flags;
+ struct kmemtrace_event *ev;
+
+ local_irq_save(flags);
+
+ ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event));
+ if (!ev)
+ goto failed;
+
+ /*
+ * Don't convert this to use structure initializers,
+ * C99 does not guarantee the rvalues evaluation order.
+ */
+ ev->event_id = KMEMTRACE_EVENT_FREE;
+ ev->type_id = va_arg(*args, int);
+ ev->event_size = sizeof(struct kmemtrace_event);
+ ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num);
+ ev->call_site = va_arg(*args, unsigned long);
+ ev->ptr = va_arg(*args, unsigned long);
+
+failed:
+ local_irq_restore(flags);
+}
+
+static struct dentry *
+kmemtrace_create_buf_file(const char *filename, struct dentry *parent,
+ int mode, struct rchan_buf *buf, int *is_global)
+{
+ return debugfs_create_file(filename, mode, parent, buf,
+ &relay_file_operations);
+}
+
+static int kmemtrace_remove_buf_file(struct dentry *dentry)
+{
+ debugfs_remove(dentry);
+
+ return 0;
+}
+
+static int kmemtrace_subbuf_start(struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ size_t prev_padding)
+{
+ if (relay_buf_full(buf)) {
+ /*
+ * We know it's not SMP-safe, but neither
+ * debugfs_create_u32() is.
+ */
+ kmemtrace_buf_overruns++;
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct rchan_callbacks relay_callbacks = {
+ .create_buf_file = kmemtrace_create_buf_file,
+ .remove_buf_file = kmemtrace_remove_buf_file,
+ .subbuf_start = kmemtrace_subbuf_start,
+};
+
+static struct dentry *kmemtrace_dir;
+static struct dentry *kmemtrace_overruns_dentry;
+static struct dentry *kmemtrace_abi_version_dentry;
+
+static struct dentry *kmemtrace_enabled_dentry;
+
+static int kmemtrace_start_probes(void)
+{
+ int err;
+
+ err = marker_probe_register("kmemtrace_alloc", "type_id %d "
+ "call_site %lu ptr %lu "
+ "bytes_req %lu bytes_alloc %lu "
+ "gfp_flags %lu node %d",
+ kmemtrace_probe_alloc, NULL);
+ if (err)
+ return err;
+ err = marker_probe_register("kmemtrace_free", "type_id %d "
+ "call_site %lu ptr %lu",
+ kmemtrace_probe_free, NULL);
+
+ return err;
+}
+
+static void kmemtrace_stop_probes(void)
+{
+ marker_probe_unregister("kmemtrace_alloc",
+ kmemtrace_probe_alloc, NULL);
+ marker_probe_unregister("kmemtrace_free",
+ kmemtrace_probe_free, NULL);
+}
+
+static int kmemtrace_enabled_get(void *data, u64 *val)
+{
+ *val = *((int *) data);
+
+ return 0;
+}
+
+static int kmemtrace_enabled_set(void *data, u64 val)
+{
+ u64 old_val = kmemtrace_enabled;
+
+ *((int *) data) = !!val;
+
+ if (old_val == val)
+ return 0;
+ if (val)
+ kmemtrace_start_probes();
+ else
+ kmemtrace_stop_probes();
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops,
+ kmemtrace_enabled_get,
+ kmemtrace_enabled_set, "%llu\n");
+
+static void kmemtrace_cleanup(void)
+{
+ if (kmemtrace_enabled_dentry)
+ debugfs_remove(kmemtrace_enabled_dentry);
+
+ kmemtrace_stop_probes();
+
+ if (kmemtrace_abi_version_dentry)
+ debugfs_remove(kmemtrace_abi_version_dentry);
+ if (kmemtrace_overruns_dentry)
+ debugfs_remove(kmemtrace_overruns_dentry);
+
+ relay_close(kmemtrace_chan);
+ kmemtrace_chan = NULL;
+
+ if (kmemtrace_dir)
+ debugfs_remove(kmemtrace_dir);
+}
+
+static int __init kmemtrace_setup_late(void)
+{
+ if (!kmemtrace_chan)
+ goto failed;
+
+ kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL);
+ if (!kmemtrace_dir)
+ goto cleanup;
+
+ kmemtrace_abi_version_dentry =
+ debugfs_create_u32("abi_version", S_IRUSR,
+ kmemtrace_dir, &kmemtrace_abi_version);
+ kmemtrace_overruns_dentry =
+ debugfs_create_u32("total_overruns", S_IRUSR,
+ kmemtrace_dir, &kmemtrace_buf_overruns);
+ if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry)
+ goto cleanup;
+
+ kmemtrace_enabled_dentry =
+ debugfs_create_file("enabled", S_IRUSR | S_IWUSR,
+ kmemtrace_dir, &kmemtrace_enabled,
+ &kmemtrace_enabled_fops);
+ if (!kmemtrace_enabled_dentry)
+ goto cleanup;
+
+ if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir))
+ goto cleanup;
+
+ printk(KERN_INFO "kmemtrace: fully up.\n");
+
+ return 0;
+
+cleanup:
+ kmemtrace_cleanup();
+failed:
+ return 1;
+}
+late_initcall(kmemtrace_setup_late);
+
+static int __init kmemtrace_set_boot_enabled(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "yes"))
+ kmemtrace_enabled = 1;
+ else if (!strcmp(str, "no"))
+ kmemtrace_enabled = 0;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kmemtrace.enable", kmemtrace_set_boot_enabled);
+
+static int __init kmemtrace_set_subbufs(char *str)
+{
+ get_option(&str, &kmemtrace_n_subbufs);
+ return 0;
+}
+early_param("kmemtrace.subbufs", kmemtrace_set_subbufs);
+
+void kmemtrace_init(void)
+{
+ if (!kmemtrace_n_subbufs)
+ kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS;
+
+ kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE,
+ kmemtrace_n_subbufs, &relay_callbacks,
+ NULL);
+ if (!kmemtrace_chan) {
+ printk(KERN_ERR "kmemtrace: could not open relay channel.\n");
+ return;
+ }
+
+ if (!kmemtrace_enabled) {
+ printk(KERN_INFO "kmemtrace: disabled. Pass "
+ "kemtrace.enable=yes as kernel parameter for "
+ "boot-time tracing.\n");
+ return;
+ }
+ if (kmemtrace_start_probes()) {
+ printk(KERN_ERR "kmemtrace: could not register marker probes!\n");
+ kmemtrace_cleanup();
+ return;
+ }
+
+ printk(KERN_INFO "kmemtrace: enabled.\n");
+}
+
diff --git a/mm/slab.c b/mm/slab.c
index 09187517f9dc..389ea62109b7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -112,6 +112,7 @@
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
+#include <linux/kmemtrace.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
+#ifdef CONFIG_KMEMTRACE
+size_t slab_buffer_size(struct kmem_cache *cachep)
+{
+ return cachep->buffer_size;
+}
+EXPORT_SYMBOL(slab_buffer_size);
+#endif
+
/*
* Do not go above this order unless 0 objects fit into the slab.
*/
@@ -2123,6 +2132,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
+ * Note that kmem_cache_name() is not guaranteed to return the same pointer,
+ * therefore applications must manage it themselves.
*
* The flags are
*
@@ -2609,7 +2620,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc_node(cachep->slabp_cache,
- local_flags & ~GFP_THISNODE, nodeid);
+ local_flags, nodeid);
if (!slabp)
return NULL;
} else {
@@ -2997,7 +3008,7 @@ retry:
* there must be at least one object available for
* allocation.
*/
- BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num);
+ BUG_ON(slabp->inuse >= cachep->num);
while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
@@ -3613,10 +3624,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
+ void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+
+ kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
+ obj_size(cachep), cachep->buffer_size, flags);
+
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);
+#ifdef CONFIG_KMEMTRACE
+void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+{
+ return __cache_alloc(cachep, flags, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+#endif
+
/**
* kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
* @cachep: the cache we're checking against
@@ -3661,23 +3685,47 @@ out:
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
- __builtin_return_address(0));
+ void *ret = __cache_alloc_node(cachep, flags, nodeid,
+ __builtin_return_address(0));
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
+ obj_size(cachep), cachep->buffer_size,
+ flags, nodeid);
+
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
+#ifdef CONFIG_KMEMTRACE
+void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
+{
+ return __cache_alloc_node(cachep, flags, nodeid,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+#endif
+
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
+ void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node(cachep, flags, node);
+ ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
+ (unsigned long) caller, ret,
+ size, cachep->buffer_size, flags, node);
+
+ return ret;
}
-#ifdef CONFIG_DEBUG_SLAB
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node,
@@ -3686,9 +3734,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
EXPORT_SYMBOL(__kmalloc_node);
void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, void *caller)
+ int node, unsigned long caller)
{
- return __do_kmalloc_node(size, flags, node, caller);
+ return __do_kmalloc_node(size, flags, node, (void *)caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#else
@@ -3710,6 +3758,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
void *caller)
{
struct kmem_cache *cachep;
+ void *ret;
/* If you want to save a few bytes .text space: replace
* __ with kmem_.
@@ -3719,20 +3768,26 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = __find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return __cache_alloc(cachep, flags, caller);
+ ret = __cache_alloc(cachep, flags, caller);
+
+ kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC,
+ (unsigned long) caller, ret,
+ size, cachep->buffer_size, flags);
+
+ return ret;
}
-#ifdef CONFIG_DEBUG_SLAB
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
{
- return __do_kmalloc(size, flags, caller);
+ return __do_kmalloc(size, flags, (void *)caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
@@ -3762,6 +3817,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
debug_check_no_obj_freed(objp, obj_size(cachep));
__cache_free(cachep, objp);
local_irq_restore(flags);
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3788,6 +3845,8 @@ void kfree(const void *objp)
debug_check_no_obj_freed(objp, obj_size(c));
__cache_free(c, (void *)objp);
local_irq_restore(flags);
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp);
}
EXPORT_SYMBOL(kfree);
diff --git a/mm/slob.c b/mm/slob.c
index cb675d126791..55de44ae5d30 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,6 +65,7 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
+#include <linux/kmemtrace.h>
#include <asm/atomic.h>
/*
@@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
unsigned int *m;
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ void *ret;
if (size < PAGE_SIZE - align) {
if (!size)
return ZERO_SIZE_PTR;
m = slob_alloc(size + align, gfp, align, node);
+
if (!m)
return NULL;
*m = size;
- return (void *)m + align;
+ ret = (void *)m + align;
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
+ _RET_IP_, ret,
+ size, size + align, gfp, node);
} else {
- void *ret;
+ unsigned int order = get_order(size);
- ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
+ ret = slob_new_page(gfp | __GFP_COMP, order, node);
if (ret) {
struct page *page;
page = virt_to_page(ret);
page->private = size;
}
- return ret;
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
+ _RET_IP_, ret,
+ size, PAGE_SIZE << order, gfp, node);
}
+
+ return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
@@ -501,6 +513,8 @@ void kfree(const void *block)
slob_free(m, *m + align);
} else
put_page(&sp->page);
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block);
}
EXPORT_SYMBOL(kfree);
@@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
- if (c->size < PAGE_SIZE)
+ if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
- else
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
+ _RET_IP_, b, c->size,
+ SLOB_UNITS(c->size) * SLOB_UNIT,
+ flags, node);
+ } else {
b = slob_new_page(flags, get_order(c->size), node);
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
+ _RET_IP_, b, c->size,
+ PAGE_SIZE << get_order(c->size),
+ flags, node);
+ }
if (c->ctor)
c->ctor(b);
@@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
} else {
__kmem_cache_free(b, c->size);
}
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b);
}
EXPORT_SYMBOL(kmem_cache_free);
diff --git a/mm/slub.c b/mm/slub.c
index 72c5fd069f65..d00628edec0e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -24,6 +24,7 @@
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
+#include <linux/kmemtrace.h>
/*
* Lock order:
@@ -128,10 +129,10 @@
/*
* Maximum number of desirable partial slabs.
- * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * More slabs cause kmem_cache_shrink to sort the slabs by objects
+ * and triggers slab defragmentation.
*/
-#define MAX_PARTIAL 10
+#define MAX_PARTIAL 20
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
@@ -153,6 +154,10 @@
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
+#define OO_SHIFT 16
+#define OO_MASK ((1 << OO_SHIFT) - 1)
+#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
+
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
@@ -174,11 +179,14 @@ static enum {
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects;
+
/*
* Tracking user of a slab.
*/
struct track {
- void *addr; /* Called from address */
+ unsigned long addr; /* Called from address */
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
@@ -290,7 +298,7 @@ static inline struct kmem_cache_order_objects oo_make(int order,
unsigned long size)
{
struct kmem_cache_order_objects x = {
- (order << 16) + (PAGE_SIZE << order) / size
+ (order << OO_SHIFT) + (PAGE_SIZE << order) / size
};
return x;
@@ -298,12 +306,12 @@ static inline struct kmem_cache_order_objects oo_make(int order,
static inline int oo_order(struct kmem_cache_order_objects x)
{
- return x.x >> 16;
+ return x.x >> OO_SHIFT;
}
static inline int oo_objects(struct kmem_cache_order_objects x)
{
- return x.x & ((1 << 16) - 1);
+ return x.x & OO_MASK;
}
#ifdef CONFIG_SLUB_DEBUG
@@ -367,7 +375,7 @@ static struct track *get_track(struct kmem_cache *s, void *object,
}
static void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, void *addr)
+ enum track_item alloc, unsigned long addr)
{
struct track *p;
@@ -391,8 +399,8 @@ static void init_tracking(struct kmem_cache *s, void *object)
if (!(s->flags & SLAB_STORE_USER))
return;
- set_track(s, object, TRACK_FREE, NULL);
- set_track(s, object, TRACK_ALLOC, NULL);
+ set_track(s, object, TRACK_FREE, 0UL);
+ set_track(s, object, TRACK_ALLOC, 0UL);
}
static void print_track(const char *s, struct track *t)
@@ -401,7 +409,7 @@ static void print_track(const char *s, struct track *t)
return;
printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
- s, t->addr, jiffies - t->when, t->cpu, t->pid);
+ s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
}
static void print_tracking(struct kmem_cache *s, void *object)
@@ -764,8 +772,8 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
}
max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
- if (max_objects > 65535)
- max_objects = 65535;
+ if (max_objects > MAX_OBJS_PER_PAGE)
+ max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
slab_err(s, page, "Wrong number of objects. Found %d but "
@@ -866,7 +874,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
}
static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, void *addr)
+ void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto bad;
@@ -906,7 +914,7 @@ bad:
}
static int free_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, void *addr)
+ void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto fail;
@@ -1029,10 +1037,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, void *addr) { return 0; }
+ struct page *page, void *object, unsigned long addr) { return 0; }
static inline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, void *addr) { return 0; }
+ struct page *page, void *object, unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1128,6 +1136,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
SLAB_STORE_USER | SLAB_TRACE))
__SetPageSlubDebug(page);
+ if (s->kick)
+ __SetPageSlubKickable(page);
+
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
@@ -1168,6 +1179,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
+ __ClearPageSlubKickable(page);
__ClearPageSlab(page);
reset_page_mapcount(page);
__free_pages(page, order);
@@ -1378,6 +1390,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
if (SLABDEBUG && PageSlubDebug(page) &&
(s->flags & SLAB_STORE_USER))
add_full(n, page);
+ if (s->kick)
+ __SetPageSlubKickable(page);
}
slab_unlock(page);
} else {
@@ -1499,8 +1513,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*/
-static void *__slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
struct page *new;
@@ -1584,13 +1598,14 @@ debug:
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr)
+ gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
unsigned int objsize;
+ might_sleep_if(gfpflags & __GFP_WAIT);
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
objsize = c->objsize;
@@ -1613,18 +1628,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
+ void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
+
+ kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
+ s->objsize, s->size, gfpflags);
+
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);
+#ifdef CONFIG_KMEMTRACE
+void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+{
+ return slab_alloc(s, gfpflags, -1, _RET_IP_);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+#endif
+
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
+ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
+ s->objsize, s->size, gfpflags, node);
+
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif
+#ifdef CONFIG_KMEMTRACE
+void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
+ gfp_t gfpflags,
+ int node)
+{
+ return slab_alloc(s, gfpflags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+#endif
+
/*
* Slow patch handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
@@ -1634,7 +1677,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, void *addr, unsigned int offset)
+ void *x, unsigned long addr, unsigned int offset)
{
void *prior;
void **object = (void *)x;
@@ -1704,7 +1747,7 @@ debug:
* with all sorts of special processing.
*/
static __always_inline void slab_free(struct kmem_cache *s,
- struct page *page, void *x, void *addr)
+ struct page *page, void *x, unsigned long addr)
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
@@ -1731,11 +1774,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
page = virt_to_head_page(x);
- slab_free(s, page, x, __builtin_return_address(0));
+ slab_free(s, page, x, _RET_IP_);
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab object the object resides */
+/* Figure out on which slab page the object resides */
static struct page *get_object_page(const void *x)
{
struct page *page = virt_to_head_page(x);
@@ -1807,8 +1852,8 @@ static inline int slab_order(int size, int min_objects,
int rem;
int min_order = slub_min_order;
- if ((PAGE_SIZE << min_order) / size > 65535)
- return get_order(size * 65535) - 1;
+ if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+ return get_order(size * MAX_OBJS_PER_PAGE) - 1;
for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
@@ -2073,8 +2118,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
* when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
-static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
- int node)
+static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
{
struct page *page;
struct kmem_cache_node *n;
@@ -2112,7 +2156,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
local_irq_save(flags);
add_partial(n, page, 0);
local_irq_restore(flags);
- return n;
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2144,8 +2187,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
n = &s->local_node;
else {
if (slab_state == DOWN) {
- n = early_kmem_cache_node_alloc(gfpflags,
- node);
+ early_kmem_cache_node_alloc(gfpflags, node);
continue;
}
n = kmem_cache_alloc_node(kmalloc_caches,
@@ -2313,6 +2355,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
goto error;
s->refcount = 1;
+ s->defrag_ratio = 30;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
@@ -2519,7 +2562,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
flags, NULL))
goto panic;
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto panic;
@@ -2582,7 +2625,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
goto unlock_out;
}
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
kmalloc_caches_dma[index] = s;
schedule_work(&sysfs_add_work);
@@ -2650,6 +2693,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
+ void *ret;
if (unlikely(size > PAGE_SIZE))
return kmalloc_large(size, flags);
@@ -2659,7 +2703,12 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, flags, -1, __builtin_return_address(0));
+ ret = slab_alloc(s, flags, -1, _RET_IP_);
+
+ kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
+ size, s->size, flags);
+
+ return ret;
}
EXPORT_SYMBOL(__kmalloc);
@@ -2678,16 +2727,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
+ void *ret;
- if (unlikely(size > PAGE_SIZE))
- return kmalloc_large_node(size, flags, node);
+ if (unlikely(size > PAGE_SIZE)) {
+ ret = kmalloc_large_node(size, flags, node);
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
+ _RET_IP_, ret,
+ size, PAGE_SIZE << get_order(size),
+ flags, node);
+
+ return ret;
+ }
s = get_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, flags, node, __builtin_return_address(0));
+ ret = slab_alloc(s, flags, node, _RET_IP_);
+
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
+ size, s->size, flags, node);
+
+ return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
@@ -2744,81 +2807,291 @@ void kfree(const void *x)
put_page(page);
return;
}
- slab_free(page->slab, page, object, __builtin_return_address(0));
+ slab_free(page->slab, page, object, _RET_IP_);
+
+ kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x);
}
EXPORT_SYMBOL(kfree);
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * Allocate a slab scratch space that is sufficient to keep at least
+ * max_defrag_slab_objects pointers to individual objects and also a bitmap
+ * for max_defrag_slab_objects.
+ */
+static inline void *alloc_scratch(void)
+{
+ return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+ BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+ GFP_KERNEL);
+}
+
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+ kmem_defrag_get_func get, kmem_defrag_kick_func kick)
+{
+ int max_objects = oo_objects(s->max);
+
+ /*
+ * Defragmentable slabs must have a ctor otherwise objects may be
+ * in an undetermined state after they are allocated.
+ */
+ BUG_ON(!s->ctor);
+ s->get = get;
+ s->kick = kick;
+ down_write(&slub_lock);
+ list_move(&s->list, &slab_caches);
+ if (max_objects > max_defrag_slab_objects)
+ max_defrag_slab_objects = max_objects;
+ up_write(&slub_lock);
+}
+EXPORT_SYMBOL(kmem_cache_setup_defrag);
+
+/*
+ * Vacate all objects in the given slab.
*
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
+ * The scratch aread passed to list function is sufficient to hold
+ * struct listhead times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+static int kmem_cache_vacate(struct page *page, void *scratch)
{
- int node;
- int i;
- struct kmem_cache_node *n;
- struct page *page;
- struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ void **vector = scratch;
+ void *p;
+ void *addr = page_address(page);
+ struct kmem_cache *s;
+ unsigned long *map;
+ int leftover;
+ int count;
+ void *private;
unsigned long flags;
+ unsigned long objects;
+ struct kmem_cache_cpu *c;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ local_irq_save(flags);
+ slab_lock(page);
- flush_all(s);
- for_each_node_state(node, N_NORMAL_MEMORY) {
- n = get_node(s, node);
+ BUG_ON(!PageSlab(page)); /* Must be s slab page */
+ BUG_ON(!PageSlubFrozen(page)); /* Slab must have been frozen earlier */
- if (!n->nr_partial)
- continue;
+ s = page->slab;
+ objects = page->objects;
+ map = scratch + objects * sizeof(void **);
+ if (!page->inuse || !s->kick || !PageSlubKickable(page))
+ goto out;
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ /* Determine used objects */
+ bitmap_fill(map, objects);
+ for_each_free_object(p, s, page->freelist)
+ __clear_bit(slab_index(p, s, addr), map);
- spin_lock_irqsave(&n->list_lock, flags);
+ /* Build vector of pointers to objects */
+ count = 0;
+ memset(vector, 0, objects * sizeof(void **));
+ for_each_object(p, s, addr, objects)
+ if (test_bit(slab_index(p, s, addr), map))
+ vector[count++] = p;
- /*
- * Build lists indexed by the items in use in each slab.
- *
- * Note that concurrent frees may occur while we hold the
- * list_lock. page->inuse here is the upper limit.
- */
- list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse && slab_trylock(page)) {
+ private = s->get(s, count, vector);
+
+ /*
+ * Got references. Now we can drop the slab lock. The slab
+ * is frozen so it cannot vanish from under us nor will
+ * allocations be performed on the slab. However, unlocking the
+ * slab will allow concurrent slab_frees to proceed.
+ */
+ slab_unlock(page);
+ local_irq_restore(flags);
+
+ /*
+ * Perform the KICK callbacks to remove the objects.
+ */
+ s->kick(s, count, vector, private);
+
+ local_irq_save(flags);
+ slab_lock(page);
+out:
+ /*
+ * Check the result and unfreeze the slab
+ */
+ leftover = page->inuse;
+ c = get_cpu_slab(s, smp_processor_id());
+ if (leftover) {
+ /* Unsuccessful reclaim. Avoid future reclaim attempts. */
+ stat(c, SHRINK_OBJECT_RECLAIM_FAILED);
+ __ClearPageSlubKickable(page);
+ } else
+ stat(c, SHRINK_SLAB_RECLAIMED);
+ unfreeze_slab(s, page, leftover > 0);
+ local_irq_restore(flags);
+ return leftover;
+}
+
+/*
+ * Remove objects from a list of slab pages that have been gathered.
+ * Must be called with slabs that have been isolated before.
+ *
+ * kmem_cache_reclaim() is never called from an atomic context. It
+ * allocates memory for temporary storage. We are holding the
+ * slub_lock semaphore which prevents another call into
+ * the defrag logic.
+ */
+int kmem_cache_reclaim(struct list_head *zaplist)
+{
+ int freed = 0;
+ void **scratch;
+ struct page *page;
+ struct page *page2;
+
+ if (list_empty(zaplist))
+ return 0;
+
+ scratch = alloc_scratch();
+ if (!scratch)
+ return 0;
+
+ list_for_each_entry_safe(page, page2, zaplist, lru) {
+ list_del(&page->lru);
+ if (kmem_cache_vacate(page, scratch) == 0)
+ freed++;
+ }
+ kfree(scratch);
+ return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ * by releasing slabs with zero objects and trying to reclaim
+ * slabs with less than the configured percentage of objects allocated.
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node,
+ unsigned long limit)
+{
+ unsigned long flags;
+ struct page *page, *page2;
+ LIST_HEAD(zaplist);
+ int freed = 0;
+ struct kmem_cache_node *n = get_node(s, node);
+ struct kmem_cache_cpu *c;
+
+ if (n->nr_partial <= limit)
+ return 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ c = get_cpu_slab(s, smp_processor_id());
+ stat(c, SHRINK_CALLS);
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ if (!slab_trylock(page))
+ /* Busy slab. Get out of the way */
+ continue;
+
+ if (page->inuse) {
+ if (!PageSlubKickable(page) || page->inuse * 100 >=
+ s->defrag_ratio * page->objects) {
+ slab_unlock(page);
/*
- * Must hold slab lock here because slab_free
- * may have freed the last object and be
- * waiting to release the slab.
+ * Slab contains enough objects
+ * or we alrady tried reclaim before and
+ * it failed. Skip this one.
*/
- list_del(&page->lru);
+ continue;
+ }
+
+ list_move(&page->lru, &zaplist);
+ if (s->kick) {
+ stat(c, SHRINK_ATTEMPT_DEFRAG);
n->nr_partial--;
- slab_unlock(page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
+ __SetPageSlubFrozen(page);
}
+ slab_unlock(page);
+ } else {
+ /* Empty slab page */
+ stat(c, SHRINK_EMPTY_SLAB);
+ list_del(&page->lru);
+ n->nr_partial--;
+ slab_unlock(page);
+ discard_slab(s, page);
+ freed++;
}
+ }
+ if (!s->kick)
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * No defrag methods. By simply putting the zaplist at the
+ * end of the partial list we can let them simmer longer
+ * and thus increase the chance of all objects being
+ * reclaimed.
+ *
+ * We have effectively sorted the partial list and put
+ * the slabs with more objects first. As soon as they
+ * are allocated they are going to be removed from the
+ * partial list.
*/
- for (i = objects - 1; i >= 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ list_splice(&zaplist, n->partial.prev);
- spin_unlock_irqrestore(&n->list_lock, flags);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (s->kick)
+ freed += kmem_cache_reclaim(&zaplist);
+
+ return freed;
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation in a page.
+ */
+int kmem_cache_defrag(int node)
+{
+ struct kmem_cache *s;
+ unsigned long slabs = 0;
+
+ /*
+ * kmem_cache_defrag may be called from the reclaim path which may be
+ * called for any page allocator alloc. So there is the danger that we
+ * get called in a situation where slub already acquired the slub_lock
+ * for other purposes.
+ */
+ if (!down_read_trylock(&slub_lock))
+ return 0;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ unsigned long reclaimed = 0;
+
+ /*
+ * Defragmentable caches come first. If the slab cache is not
+ * defragmentable then we can stop traversing the list.
+ */
+ if (!s->kick)
+ break;
+
+ if (node == -1) {
+ int nid;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY)
+ reclaimed += __kmem_cache_shrink(s, nid,
+ MAX_PARTIAL);
+ } else
+ reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL);
+
+ slabs += reclaimed;
}
+ up_read(&slub_lock);
+ return slabs;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists.
+ * If the slab cache supports defragmentation then objects are
+ * reclaimed.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ __kmem_cache_shrink(s, node, 0);
- kfree(slabs_by_inuse);
return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -3043,7 +3316,7 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
- if (s->ctor)
+ if (s->ctor || s->kick || s->get)
return 1;
/*
@@ -3132,7 +3405,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (s) {
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto err;
@@ -3202,9 +3475,10 @@ static struct notifier_block __cpuinitdata slab_notifier = {
#endif
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
{
struct kmem_cache *s;
+ void *ret;
if (unlikely(size > PAGE_SIZE))
return kmalloc_large(size, gfpflags);
@@ -3214,13 +3488,20 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, gfpflags, -1, caller);
+ ret = slab_alloc(s, gfpflags, -1, caller);
+
+ /* Honor the call site pointer we recieved. */
+ kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size,
+ s->size, gfpflags);
+
+ return ret;
}
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, void *caller)
+ int node, unsigned long caller)
{
struct kmem_cache *s;
+ void *ret;
if (unlikely(size > PAGE_SIZE))
return kmalloc_large_node(size, gfpflags, node);
@@ -3230,7 +3511,13 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, gfpflags, node, caller);
+ ret = slab_alloc(s, gfpflags, node, caller);
+
+ /* Honor the call site pointer we recieved. */
+ kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret,
+ size, s->size, gfpflags, node);
+
+ return ret;
}
#ifdef CONFIG_SLUB_DEBUG
@@ -3429,7 +3716,7 @@ static void resiliency_test(void) {};
struct location {
unsigned long count;
- void *addr;
+ unsigned long addr;
long long sum_time;
long min_time;
long max_time;
@@ -3477,7 +3764,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
{
long start, end, pos;
struct location *l;
- void *caddr;
+ unsigned long caddr;
unsigned long age = jiffies - track->when;
start = -1;
@@ -3817,16 +4104,32 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR(order);
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
{
+ int x = 0;
+
if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "ctor : ");
+ x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "\n");
+ }
- return n + sprintf(buf + n, "\n");
+ if (s->get) {
+ x += sprintf(buf + x, "get : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->get);
+ x += sprintf(buf + x, "\n");
}
- return 0;
+
+ if (s->kick) {
+ x += sprintf(buf + x, "kick : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->kick);
+ x += sprintf(buf + x, "\n");
+ }
+ return x;
}
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
@@ -4046,6 +4349,27 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(free_calls);
+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long ratio;
+ int err;
+
+ err = strict_strtoul(buf, 10, &ratio);
+ if (err)
+ return err;
+
+ if (ratio < 100)
+ s->defrag_ratio = ratio;
+ return length;
+}
+SLAB_ATTR(defrag_ratio);
+
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4125,6 +4449,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(SHRINK_CALLS, shrink_calls);
+STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag);
+STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab);
+STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped);
+STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed);
+STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed);
#endif
static struct attribute *slab_attrs[] = {
@@ -4138,7 +4468,7 @@ static struct attribute *slab_attrs[] = {
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
- &ctor_attr.attr,
+ &ops_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
&sanity_checks_attr.attr,
@@ -4153,6 +4483,7 @@ static struct attribute *slab_attrs[] = {
&shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+ &defrag_ratio_attr.attr,
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
@@ -4178,6 +4509,12 @@ static struct attribute *slab_attrs[] = {
&deactivate_to_tail_attr.attr,
&deactivate_remote_frees_attr.attr,
&order_fallback_attr.attr,
+ &shrink_calls_attr.attr,
+ &shrink_attempt_defrag_attr.attr,
+ &shrink_empty_slab_attr.attr,
+ &shrink_slab_skipped_attr.attr,
+ &shrink_slab_reclaimed_attr.attr,
+ &shrink_object_reclaim_failed_attr.attr,
#endif
NULL
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62e7f62fb559..c45074e6cc5d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -151,6 +151,14 @@ void unregister_shrinker(struct shrinker *shrinker)
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
+
+/*
+ * Trigger a call into slab defrag if the sum of the returns from
+ * shrinkers cross this value.
+ */
+int slab_defrag_limit = 1000;
+int slab_defrag_counter;
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -168,10 +176,18 @@ EXPORT_SYMBOL(unregister_shrinker);
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performace of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure but the VFS infrastructure does not allow that at the present
+ * time.
+ *
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+ unsigned long lru_pages, struct zone *zone)
{
struct shrinker *shrinker;
unsigned long ret = 0;
@@ -228,6 +244,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+
+
+ /* Avoid dirtying cachelines */
+ if (!ret)
+ return 0;
+
+ /*
+ * "ret" doesnt really contain the freed object count. The shrinkers
+ * fake it. Gotta go with what we are getting though.
+ *
+ * Handling of the defrag_counter is also racy. If we get the
+ * wrong counts then we may unnecessarily do a defrag pass or defer
+ * one. "ret" is already faked. So this is just increasing
+ * the already existing fuzziness to get some notion as to when
+ * to initiate slab defrag which will hopefully be okay.
+ */
+ if (zone) {
+ /* balance_pgdat running on a zone so we only scan one node */
+ zone->slab_defrag_counter += ret;
+ if (zone->slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ zone->slab_defrag_counter = 0;
+ kmem_cache_defrag(zone_to_nid(zone));
+ }
+ } else {
+ /* Direct (and thus global) reclaim. Scan all nodes */
+ slab_defrag_counter += ret;
+ if (slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ slab_defrag_counter = 0;
+ kmem_cache_defrag(-1);
+ }
+ }
return ret;
}
@@ -1586,7 +1635,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scan_global_lru(sc)) {
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1820,7 +1869,7 @@ loop_again:
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ lru_pages, zone);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
@@ -2060,7 +2109,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
if (!reclaim_state.reclaimed_slab)
break;
@@ -2098,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- global_lru_pages());
+ global_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -2115,7 +2164,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
}
@@ -2277,7 +2326,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+ zone) &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
slab_reclaimable - nr_pages)
;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3ccfda23adc..c4b7280e6e70 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,11 +774,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
#endif
}
seq_printf(m,
+ "\n slab_defrag_count: %lu"
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone_is_all_unreclaimable(zone),
+ zone->slab_defrag_counter,
+ zone_is_all_unreclaimable(zone),
zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);