author	Stephen Rothwell <sfr@canb.auug.org.au>	2008-12-04 16:33:25 +1100
committer	Stephen Rothwell <sfr@canb.auug.org.au>	2008-12-04 16:33:25 +1100
commit	66c3b54aaf127ee1a99c7a8e3a907f4e85715a9b (patch)
tree	619a4d458c85af177f11cdf557256717d1f35730 /mm
parent	3b22d14aeb9e110f956637a7655d2cbfe7d5da79 (diff)
parent	00f39605364d5d04d669c7c280d8645efee3aa51 (diff)
Merge commit 'slab/for-next'
Diffstat (limited to 'mm')
-rw-r--r--	mm/Makefile	|   1
-rw-r--r--	mm/kmemtrace.c	| 333
-rw-r--r--	mm/slab.c	|  85
-rw-r--r--	mm/slob.c	|  37
-rw-r--r--	mm/slub.c	| 543
-rw-r--r--	mm/vmscan.c	|  64
-rw-r--r--	mm/vmstat.c	|   4
7 files changed, 937 insertions, 130 deletions
diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..2a70a805027c --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,333 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include <linux/string.h> +#include <linux/debugfs.h> +#include <linux/relay.h> +#include <linux/module.h> +#include <linux/marker.h> +#include <linux/gfp.h> +#include <linux/kmemtrace.h> + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; + +/* disabled by default */ +static unsigned int kmemtrace_enabled; + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. + */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. 
+ */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. + */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + 
goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (!kmemtrace_chan) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (!kmemtrace_enabled) { + printk(KERN_INFO "kmemtrace: disabled. Pass " + "kemtrace.enable=yes as kernel parameter for " + "boot-time tracing.\n"); + return; + } + if (kmemtrace_start_probes()) { + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); + return; + } + + printk(KERN_INFO "kmemtrace: enabled.\n"); +} + diff --git a/mm/slab.c b/mm/slab.c index 09187517f9dc..389ea62109b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -112,6 +112,7 @@ #include <linux/rtmutex.h> #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> +#include <linux/kmemtrace.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -2123,6 +2132,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. + * Note that kmem_cache_name() is not guaranteed to return the same pointer, + * therefore applications must manage it themselves. * * The flags are * @@ -2609,7 +2620,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ slabp = kmem_cache_alloc_node(cachep->slabp_cache, - local_flags & ~GFP_THISNODE, nodeid); + local_flags, nodeid); if (!slabp) return NULL; } else { @@ -2997,7 +3008,7 @@ retry: * there must be at least one object available for * allocation. 
*/ - BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + BUG_ON(slabp->inuse >= cachep->num); while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); @@ -3613,10 +3624,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against @@ -3661,23 +3685,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3686,9 +3734,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, void *caller) + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, flags, node, (void *)caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else @@ -3710,6 +3758,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. 
@@ -3719,20 +3768,26 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, caller); + return __do_kmalloc(size, flags, (void *)caller); } EXPORT_SYMBOL(__kmalloc_track_caller); @@ -3762,6 +3817,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3788,6 +3845,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); diff --git a/mm/slob.c b/mm/slob.c index cb675d126791..55de44ae5d30 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> +#include <linux/kmemtrace.h> #include <asm/atomic.h> /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } 
EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 72c5fd069f65..d00628edec0e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,7 @@ #include <linux/kallsyms.h> #include <linux/memory.h> #include <linux/math64.h> +#include <linux/kmemtrace.h> /* * Lock order: @@ -128,10 +129,10 @@ /* * Maximum number of desirable partial slabs. - * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * More slabs cause kmem_cache_shrink to sort the slabs by objects + * and triggers slab defragmentation. */ -#define MAX_PARTIAL 10 +#define MAX_PARTIAL 20 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -153,6 +154,10 @@ #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +#define OO_SHIFT 16 +#define OO_MASK ((1 << OO_SHIFT) - 1) +#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ + /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@ -174,11 +179,14 @@ static enum { static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); +/* Maximum objects in defragmentable slabs */ +static unsigned int max_defrag_slab_objects; + /* * Tracking user of a slab. */ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -290,7 +298,7 @@ static inline struct kmem_cache_order_objects oo_make(int order, unsigned long size) { struct kmem_cache_order_objects x = { - (order << 16) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + (PAGE_SIZE << order) / size }; return x; @@ -298,12 +306,12 @@ static inline struct kmem_cache_order_objects oo_make(int order, static inline int oo_order(struct kmem_cache_order_objects x) { - return x.x >> 16; + return x.x >> OO_SHIFT; } static inline int oo_objects(struct kmem_cache_order_objects x) { - return x.x & ((1 << 16) - 1); + return x.x & OO_MASK; } #ifdef CONFIG_SLUB_DEBUG @@ -367,7 +375,7 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { struct track *p; @@ -391,8 +399,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -401,7 +409,7 @@ static void print_track(const char *s, struct track *t) return; printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@ -764,8 +772,8 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) } max_objects = (PAGE_SIZE << compound_order(page)) / s->size; - if (max_objects > 65535) - max_objects = 65535; + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { slab_err(s, page, "Wrong number of objects. 
Found %d but " @@ -866,7 +874,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@ -906,7 +914,7 @@ bad: } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -1029,10 +1037,10 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1128,6 +1136,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) SLAB_STORE_USER | SLAB_TRACE)) __SetPageSlubDebug(page); + if (s->kick) + __SetPageSlubKickable(page); + start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) @@ -1168,6 +1179,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlubKickable(page); __ClearPageSlab(page); reset_page_mapcount(page); __free_pages(page, order); @@ -1378,6 +1390,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) if (SLABDEBUG && PageSlubDebug(page) && (s->flags & SLAB_STORE_USER)) add_full(n, page); + if (s->kick) + __SetPageSlubKickable(page); } slab_unlock(page); } else { @@ -1499,8 +1513,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; @@ -1584,13 +1598,14 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. 
*/ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; unsigned long flags; unsigned int objsize; + might_sleep_if(gfpflags & __GFP_WAIT); local_irq_save(flags); c = get_cpu_slab(s, smp_processor_id()); objsize = c->objsize; @@ -1613,18 +1628,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1634,7 +1677,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1704,7 +1747,7 @@ debug: * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; @@ -1731,11 +1774,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ +/* Figure out on which slab page the object resides */ static struct page *get_object_page(const void *x) { struct page *page = virt_to_head_page(x); @@ -1807,8 +1852,8 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > 65535) - return get_order(size * 65535) - 1; + if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); @@ -2073,8 +2118,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. 
*/ -static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) { struct page *page; struct kmem_cache_node *n; @@ -2112,7 +2156,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, local_irq_save(flags); add_partial(n, page, 0); local_irq_restore(flags); - return n; } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2144,8 +2187,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) n = &s->local_node; else { if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); + early_kmem_cache_node_alloc(gfpflags, node); continue; } n = kmem_cache_alloc_node(kmalloc_caches, @@ -2313,6 +2355,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, goto error; s->refcount = 1; + s->defrag_ratio = 30; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -2519,7 +2562,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, flags, NULL)) goto panic; - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto panic; @@ -2582,7 +2625,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) goto unlock_out; } - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; schedule_work(&sysfs_add_work); @@ -2650,6 +2693,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2659,7 +2703,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2678,16 +2727,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2744,81 +2807,291 @@ void kfree(const void *x) put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. 
+ * Allocate a slab scratch space that is sufficient to keep at least + * max_defrag_slab_objects pointers to individual objects and also a bitmap + * for max_defrag_slab_objects. + */ +static inline void *alloc_scratch(void) +{ + return kmalloc(max_defrag_slab_objects * sizeof(void *) + + BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long), + GFP_KERNEL); +} + +void kmem_cache_setup_defrag(struct kmem_cache *s, + kmem_defrag_get_func get, kmem_defrag_kick_func kick) +{ + int max_objects = oo_objects(s->max); + + /* + * Defragmentable slabs must have a ctor otherwise objects may be + * in an undetermined state after they are allocated. + */ + BUG_ON(!s->ctor); + s->get = get; + s->kick = kick; + down_write(&slub_lock); + list_move(&s->list, &slab_caches); + if (max_objects > max_defrag_slab_objects) + max_defrag_slab_objects = max_objects; + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_setup_defrag); + +/* + * Vacate all objects in the given slab. * - * The slabs with the least items are placed last. This results in them - * being allocated from last increasing the chance that the last objects - * are freed in them. + * The scratch aread passed to list function is sufficient to hold + * struct listhead times objects per slab. We use it to hold void ** times + * objects per slab plus a bitmap for each object. */ -int kmem_cache_shrink(struct kmem_cache *s) +static int kmem_cache_vacate(struct page *page, void *scratch) { - int node; - int i; - struct kmem_cache_node *n; - struct page *page; - struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + void **vector = scratch; + void *p; + void *addr = page_address(page); + struct kmem_cache *s; + unsigned long *map; + int leftover; + int count; + void *private; unsigned long flags; + unsigned long objects; + struct kmem_cache_cpu *c; - if (!slabs_by_inuse) - return -ENOMEM; + local_irq_save(flags); + slab_lock(page); - flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); + BUG_ON(!PageSlab(page)); /* Must be s slab page */ + BUG_ON(!PageSlubFrozen(page)); /* Slab must have been frozen earlier */ - if (!n->nr_partial) - continue; + s = page->slab; + objects = page->objects; + map = scratch + objects * sizeof(void **); + if (!page->inuse || !s->kick || !PageSlubKickable(page)) + goto out; - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + /* Determine used objects */ + bitmap_fill(map, objects); + for_each_free_object(p, s, page->freelist) + __clear_bit(slab_index(p, s, addr), map); - spin_lock_irqsave(&n->list_lock, flags); + /* Build vector of pointers to objects */ + count = 0; + memset(vector, 0, objects * sizeof(void **)); + for_each_object(p, s, addr, objects) + if (test_bit(slab_index(p, s, addr), map)) + vector[count++] = p; - /* - * Build lists indexed by the items in use in each slab. - * - * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. - */ - list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { + private = s->get(s, count, vector); + + /* + * Got references. Now we can drop the slab lock. The slab + * is frozen so it cannot vanish from under us nor will + * allocations be performed on the slab. However, unlocking the + * slab will allow concurrent slab_frees to proceed. 
+ */ + slab_unlock(page); + local_irq_restore(flags); + + /* + * Perform the KICK callbacks to remove the objects. + */ + s->kick(s, count, vector, private); + + local_irq_save(flags); + slab_lock(page); +out: + /* + * Check the result and unfreeze the slab + */ + leftover = page->inuse; + c = get_cpu_slab(s, smp_processor_id()); + if (leftover) { + /* Unsuccessful reclaim. Avoid future reclaim attempts. */ + stat(c, SHRINK_OBJECT_RECLAIM_FAILED); + __ClearPageSlubKickable(page); + } else + stat(c, SHRINK_SLAB_RECLAIMED); + unfreeze_slab(s, page, leftover > 0); + local_irq_restore(flags); + return leftover; +} + +/* + * Remove objects from a list of slab pages that have been gathered. + * Must be called with slabs that have been isolated before. + * + * kmem_cache_reclaim() is never called from an atomic context. It + * allocates memory for temporary storage. We are holding the + * slub_lock semaphore which prevents another call into + * the defrag logic. + */ +int kmem_cache_reclaim(struct list_head *zaplist) +{ + int freed = 0; + void **scratch; + struct page *page; + struct page *page2; + + if (list_empty(zaplist)) + return 0; + + scratch = alloc_scratch(); + if (!scratch) + return 0; + + list_for_each_entry_safe(page, page2, zaplist, lru) { + list_del(&page->lru); + if (kmem_cache_vacate(page, scratch) == 0) + freed++; + } + kfree(scratch); + return freed; +} + +/* + * Shrink the slab cache on a particular node of the cache + * by releasing slabs with zero objects and trying to reclaim + * slabs with less than the configured percentage of objects allocated. + */ +static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node, + unsigned long limit) +{ + unsigned long flags; + struct page *page, *page2; + LIST_HEAD(zaplist); + int freed = 0; + struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_cpu *c; + + if (n->nr_partial <= limit) + return 0; + + spin_lock_irqsave(&n->list_lock, flags); + c = get_cpu_slab(s, smp_processor_id()); + stat(c, SHRINK_CALLS); + list_for_each_entry_safe(page, page2, &n->partial, lru) { + if (!slab_trylock(page)) + /* Busy slab. Get out of the way */ + continue; + + if (page->inuse) { + if (!PageSlubKickable(page) || page->inuse * 100 >= + s->defrag_ratio * page->objects) { + slab_unlock(page); /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. + * Slab contains enough objects + * or we alrady tried reclaim before and + * it failed. Skip this one. */ - list_del(&page->lru); + continue; + } + + list_move(&page->lru, &zaplist); + if (s->kick) { + stat(c, SHRINK_ATTEMPT_DEFRAG); n->nr_partial--; - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); + __SetPageSlubFrozen(page); } + slab_unlock(page); + } else { + /* Empty slab page */ + stat(c, SHRINK_EMPTY_SLAB); + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + freed++; } + } + if (!s->kick) /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * No defrag methods. By simply putting the zaplist at the + * end of the partial list we can let them simmer longer + * and thus increase the chance of all objects being + * reclaimed. + * + * We have effectively sorted the partial list and put + * the slabs with more objects first. As soon as they + * are allocated they are going to be removed from the + * partial list. 
*/ - for (i = objects - 1; i >= 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + list_splice(&zaplist, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (s->kick) + freed += kmem_cache_reclaim(&zaplist); + + return freed; +} + +/* + * Defrag slabs conditional on the amount of fragmentation in a page. + */ +int kmem_cache_defrag(int node) +{ + struct kmem_cache *s; + unsigned long slabs = 0; + + /* + * kmem_cache_defrag may be called from the reclaim path which may be + * called for any page allocator alloc. So there is the danger that we + * get called in a situation where slub already acquired the slub_lock + * for other purposes. + */ + if (!down_read_trylock(&slub_lock)) + return 0; + + list_for_each_entry(s, &slab_caches, list) { + unsigned long reclaimed = 0; + + /* + * Defragmentable caches come first. If the slab cache is not + * defragmentable then we can stop traversing the list. + */ + if (!s->kick) + break; + + if (node == -1) { + int nid; + + for_each_node_state(nid, N_NORMAL_MEMORY) + reclaimed += __kmem_cache_shrink(s, nid, + MAX_PARTIAL); + } else + reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL); + + slabs += reclaimed; } + up_read(&slub_lock); + return slabs; +} +EXPORT_SYMBOL(kmem_cache_defrag); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists. + * If the slab cache supports defragmentation then objects are + * reclaimed. + */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) + __kmem_cache_shrink(s, node, 0); - kfree(slabs_by_inuse); return 0; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -3043,7 +3316,7 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if (s->ctor) + if (s->ctor || s->kick || s->get) return 1; /* @@ -3132,7 +3405,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto err; @@ -3202,9 +3475,10 @@ static struct notifier_block __cpuinitdata slab_notifier = { #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); @@ -3214,13 +3488,20 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, + s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large_node(size, gfpflags, node); @@ -3230,7 +3511,13 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. 
*/ + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, + size, s->size, gfpflags, node); + + return ret; } #ifdef CONFIG_SLUB_DEBUG @@ -3429,7 +3716,7 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; @@ -3477,7 +3764,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; @@ -3817,16 +4104,32 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); -static ssize_t ctor_show(struct kmem_cache *s, char *buf) +static ssize_t ops_show(struct kmem_cache *s, char *buf) { + int x = 0; + if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); + x += sprintf(buf + x, "ctor : "); + x += sprint_symbol(buf + x, (unsigned long)s->ctor); + x += sprintf(buf + x, "\n"); + } - return n + sprintf(buf + n, "\n"); + if (s->get) { + x += sprintf(buf + x, "get : "); + x += sprint_symbol(buf + x, + (unsigned long)s->get); + x += sprintf(buf + x, "\n"); } - return 0; + + if (s->kick) { + x += sprintf(buf + x, "kick : "); + x += sprint_symbol(buf + x, + (unsigned long)s->kick); + x += sprintf(buf + x, "\n"); + } + return x; } -SLAB_ATTR_RO(ctor); +SLAB_ATTR_RO(ops); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { @@ -4046,6 +4349,27 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(free_calls); +static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->defrag_ratio); +} + +static ssize_t defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio < 100) + s->defrag_ratio = ratio; + return length; +} +SLAB_ATTR(defrag_ratio); + #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4125,6 +4449,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(SHRINK_CALLS, shrink_calls); +STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag); +STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab); +STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped); +STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed); +STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed); #endif static struct attribute *slab_attrs[] = { @@ -4138,7 +4468,7 @@ static struct attribute *slab_attrs[] = { &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, - &ctor_attr.attr, + &ops_attr.attr, &aliases_attr.attr, &align_attr.attr, &sanity_checks_attr.attr, @@ -4153,6 +4483,7 @@ static struct attribute *slab_attrs[] = { &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, + &defrag_ratio_attr.attr, #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4178,6 +4509,12 @@ static struct attribute *slab_attrs[] = { &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, + &shrink_calls_attr.attr, + &shrink_attempt_defrag_attr.attr, + &shrink_empty_slab_attr.attr, + &shrink_slab_skipped_attr.attr, + &shrink_slab_reclaimed_attr.attr, + &shrink_object_reclaim_failed_attr.attr, #endif NULL }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 
62e7f62fb559..c45074e6cc5d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -151,6 +151,14 @@ void unregister_shrinker(struct shrinker *shrinker) EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +/* + * Trigger a call into slab defrag if the sum of the returns from + * shrinkers cross this value. + */ +int slab_defrag_limit = 1000; +int slab_defrag_counter; + /* * Call the shrink functions to age shrinkable caches * @@ -168,10 +176,18 @@ EXPORT_SYMBOL(unregister_shrinker); * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. * + * zone is the zone for which we are shrinking the slabs. If the intent + * is to do a global shrink then zone may be NULL. Specification of a + * zone is currently only used to limit slab defragmentation to a NUMA node. + * The performace of shrink_slab would be better (in particular under NUMA) + * if it could be targeted as a whole to the zone that is under memory + * pressure but the VFS infrastructure does not allow that at the present + * time. + * * Returns the number of slab objects which we shrunk. */ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) + unsigned long lru_pages, struct zone *zone) { struct shrinker *shrinker; unsigned long ret = 0; @@ -228,6 +244,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, shrinker->nr += total_scan; } up_read(&shrinker_rwsem); + + + /* Avoid dirtying cachelines */ + if (!ret) + return 0; + + /* + * "ret" doesnt really contain the freed object count. The shrinkers + * fake it. Gotta go with what we are getting though. + * + * Handling of the defrag_counter is also racy. If we get the + * wrong counts then we may unnecessarily do a defrag pass or defer + * one. "ret" is already faked. So this is just increasing + * the already existing fuzziness to get some notion as to when + * to initiate slab defrag which will hopefully be okay. + */ + if (zone) { + /* balance_pgdat running on a zone so we only scan one node */ + zone->slab_defrag_counter += ret; + if (zone->slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + zone->slab_defrag_counter = 0; + kmem_cache_defrag(zone_to_nid(zone)); + } + } else { + /* Direct (and thus global) reclaim. 
Scan all nodes */ + slab_defrag_counter += ret; + if (slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + slab_defrag_counter = 0; + kmem_cache_defrag(-1); + } + } return ret; } @@ -1586,7 +1635,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1820,7 +1869,7 @@ loop_again: nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); + lru_pages, zone); nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) @@ -2060,7 +2109,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL); if (!reclaim_state.reclaimed_slab) break; @@ -2098,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -2115,7 +2164,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2277,7 +2326,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + while (shrink_slab(sc.nr_scanned, gfp_mask, order, + zone) && zone_page_state(zone, NR_SLAB_RECLAIMABLE) > slab_reclaimable - nr_pages) ; diff --git a/mm/vmstat.c b/mm/vmstat.c index c3ccfda23adc..c4b7280e6e70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,11 +774,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, + "\n slab_defrag_count: %lu" "\n all_unreclaimable: %u" "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), + zone->slab_defrag_counter, + zone_is_all_unreclaimable(zone), zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); |