-rw-r--r-- | Documentation/sysctl/vm.txt | 12
-rw-r--r-- | Documentation/vm/slabinfo.c | 95
-rw-r--r-- | fs/buffer.c | 102
-rw-r--r-- | fs/dcache.c | 148
-rw-r--r-- | fs/drop_caches.c | 2
-rw-r--r-- | fs/ext2/super.c | 9
-rw-r--r-- | fs/ext3/super.c | 8
-rw-r--r-- | fs/ext4/super.c | 8
-rw-r--r-- | fs/inode.c | 123
-rw-r--r-- | fs/proc/inode.c | 8
-rw-r--r-- | fs/reiserfs/super.c | 8
-rw-r--r-- | include/linux/fs.h | 6
-rw-r--r-- | include/linux/mm.h | 3
-rw-r--r-- | include/linux/mmzone.h | 1
-rw-r--r-- | include/linux/page-flags.h | 2
-rw-r--r-- | include/linux/slab.h | 53
-rw-r--r-- | include/linux/slub_def.h | 16
-rw-r--r-- | include/linux/swap.h | 3
-rw-r--r-- | kernel/sysctl.c | 20
-rw-r--r-- | mm/slub.c | 392
-rw-r--r-- | mm/vmscan.c | 64
-rw-r--r-- | mm/vmstat.c | 4
22 files changed, 972 insertions(+), 115 deletions(-)
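The core of the patch is a pair of per-cache callbacks: get() runs with the slab lock held and interrupts off and may only pin objects, while kick() runs unlocked, may sleep, and performs the actual freeing. Before the diff itself, a minimal sketch of how a cache owner wires this up, modeled on the fs/buffer.c and fs/inode.c hunks below; my_obj, my_cache, my_ctor, my_get, my_kick and my_obj_put are illustrative names, not part of the patch:

#include <linux/slab.h>

struct my_obj {
	atomic_t refcount;
};

static struct kmem_cache *my_cache;

/*
 * A ctor is mandatory: defragmentable objects must always be in a
 * known state, since get() may run against any allocated object.
 */
static void my_ctor(void *p)
{
	struct my_obj *obj = p;

	atomic_set(&obj->refcount, 0);
}

static void my_obj_put(struct my_obj *obj)
{
	if (atomic_dec_and_test(&obj->refcount))
		kmem_cache_free(my_cache, obj);
}

/*
 * get(): slab lock held, IRQs off. Only take stable references and
 * zap v[i] for objects that are already on their way out.
 */
static void *my_get(struct kmem_cache *s, int nr, void **v)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct my_obj *obj = v[i];

		if (!atomic_inc_not_zero(&obj->refcount))
			v[i] = NULL;	/* being freed; skip in kick() */
	}
	return NULL;	/* opaque cookie handed to kick() */
}

/*
 * kick(): no locks held, sleeping allowed. Drop the pinned objects;
 * a real implementation would also detach them from their owner, the
 * way kick_inodes() below prunes dentries and pagecache first.
 */
static void my_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
	int i;

	for (i = 0; i < nr; i++)
		if (v[i])
			my_obj_put(v[i]);
}

static int __init my_cache_init(void)
{
	my_cache = kmem_cache_create("my_cache", sizeof(struct my_obj),
				     0, SLAB_RECLAIM_ACCOUNT, my_ctor);
	if (!my_cache)
		return -ENOMEM;
	kmem_cache_setup_defrag(my_cache, my_get, my_kick);
	return 0;
}

Success is judged afterwards by page->inuse: if kick() managed to free every object on a page, the slab page itself can be discarded.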
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d79eeda7a699..5e7329a1abcc 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - numa_zonelist_order - nr_hugepages - nr_overcommit_hugepages +- slab_defrag_limit ============================================================== @@ -347,3 +348,14 @@ Change the maximum size of the hugepage pool. The maximum is nr_hugepages + nr_overcommit_hugepages. See Documentation/vm/hugetlbpage.txt + +============================================================== + +slab_defrag_limit + +Determines the frequency of calls from reclaim into slab defragmentation. +Slab defrag reclaims objects from sparsely populated slab pages. +The default is 1000. Increase if slab defragmentation occurs +too frequently. Decrease if more slab defragmentation passes +are needed. The slabinfo tool can report on the frequency of the callbacks. + diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index df3227605d59..75b56e8fc651 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -31,6 +31,8 @@ struct slabinfo { int hwcache_align, object_size, objs_per_slab; int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; + int defrag, ctor; + int defrag_ratio, remote_node_defrag_ratio; unsigned long partial, objects, slabs, objects_partial, objects_total; unsigned long alloc_fastpath, alloc_slowpath; unsigned long free_fastpath, free_slowpath; @@ -39,6 +41,9 @@ struct slabinfo { unsigned long cpuslab_flush, deactivate_full, deactivate_empty; unsigned long deactivate_to_head, deactivate_to_tail; unsigned long deactivate_remote_frees, order_fallback; + unsigned long shrink_calls, shrink_attempt_defrag, shrink_empty_slab; + unsigned long shrink_slab_skipped, shrink_slab_reclaimed; + unsigned long shrink_object_reclaim_failed; int numa[MAX_NODES]; int numa_partial[MAX_NODES]; } slabinfo[MAX_SLABS]; @@ -64,6 +69,8 @@ int show_slab = 0; int skip_zero = 1; int show_numa = 0; int show_track = 0; +int show_defrag = 0; +int show_ctor = 0; int show_first_alias = 0; int validate = 0; int shrink = 0; @@ -75,6 +82,7 @@ int sort_active = 0; int set_debug = 0; int show_ops = 0; int show_activity = 0; +int show_defragcount = 0; /* Debug options */ int sanity = 0; @@ -100,20 +108,23 @@ void fatal(const char *x, ...) void usage(void) { printf("slabinfo 5/7/2007.
(c) 2007 sgi.\n\n" - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" + "slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" "-d<options>|--debug=<options> Set/Clear Debug options\n" + "-C|--ctor Show slabs with ctors\n" "-D|--display-active Switch line format to activity\n" "-e|--empty Show empty slabs\n" "-f|--first-alias Show first alias\n" + "-F|--defrag Show defragmentable caches\n" + "-G|--display-defrag Display defrag counters\n" "-h|--help Show usage information\n" "-i|--inverted Inverted list\n" "-l|--slabs Show slabs\n" "-n|--numa Show NUMA information\n" - "-o|--ops Show kmem_cache_ops\n" + "-o|--ops Show kmem_cache_ops\n" "-s|--shrink Shrink slabs\n" - "-r|--report Detailed report on single slabs\n" + "-r|--report Detailed report on single slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" @@ -294,9 +305,11 @@ void first_line(void) { if (show_activity) printf("Name Objects Alloc Free %%Fast Fallb O\n"); + else if (show_defragcount) + printf("Name Objects DefragRQ Slabs Success Empty Skipped Failed\n"); else printf("Name Objects Objsize Space " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); + "Slabs/Part/Cpu O/S O %%Ra %%Ef Flg\n"); } /* @@ -345,7 +358,7 @@ void slab_numa(struct slabinfo *s, int mode) return; if (!line) { - printf("\n%-21s:", mode ? "NUMA nodes" : "Slab"); + printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab"); for(node = 0; node <= highest_node; node++) printf(" %4d", node); printf("\n----------------------"); @@ -354,6 +367,7 @@ void slab_numa(struct slabinfo *s, int mode) printf("\n"); } printf("%-21s ", mode ? "All slabs" : s->name); + printf("%3d ", s->remote_node_defrag_ratio); for(node = 0; node <= highest_node; node++) { char b[20]; @@ -459,22 +473,28 @@ void slab_stats(struct slabinfo *s) printf("Total %8lu %8lu\n\n", total_alloc, total_free); - if (s->cpuslab_flush) - printf("Flushes %8lu\n", s->cpuslab_flush); - - if (s->alloc_refill) - printf("Refill %8lu\n", s->alloc_refill); + if (s->cpuslab_flush || s->alloc_refill) + printf("CPU Slab : Flushes=%lu Refills=%lu\n", + s->cpuslab_flush, s->alloc_refill); total = s->deactivate_full + s->deactivate_empty + s->deactivate_to_head + s->deactivate_to_tail; if (total) - printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) " + printf("Deactivate: Full=%lu(%lu%%) Empty=%lu(%lu%%) " "ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n", s->deactivate_full, (s->deactivate_full * 100) / total, s->deactivate_empty, (s->deactivate_empty * 100) / total, s->deactivate_to_head, (s->deactivate_to_head * 100) / total, s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); + + if (s->shrink_calls) + printf("Shrink : Calls=%lu Attempts=%lu Empty=%lu Successful=%lu\n", + s->shrink_calls, s->shrink_attempt_defrag, + s->shrink_empty_slab, s->shrink_slab_reclaimed); + if (s->shrink_slab_skipped || s->shrink_object_reclaim_failed) + printf("Defrag : Slabs skipped=%lu Object reclaim failed=%lu\n", + s->shrink_slab_skipped, s->shrink_object_reclaim_failed); } void report(struct slabinfo *s) @@ -492,6 +512,8 @@ void report(struct slabinfo *s) printf("** Slabs are destroyed via RCU\n"); if (s->reclaim_account) printf("** Reclaim accounting active\n"); + if (s->defrag) + printf("** Defragmentation at %d%%\n", s->defrag_ratio); printf("\nSizes (bytes) Slabs Debug Memory\n"); printf("------------------------------------------------------------------------\n"); @@ -539,6 +561,12 @@ void 
slabcache(struct slabinfo *s) if (show_empty && s->slabs) return; + if (show_defrag && !s->defrag) + return; + + if (show_ctor && !s->ctor) + return; + store_size(size_str, slab_size(s)); snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs, s->partial, s->cpu_slabs); @@ -550,6 +578,10 @@ void slabcache(struct slabinfo *s) *p++ = '*'; if (s->cache_dma) *p++ = 'd'; + if (s->defrag) + *p++ = 'F'; + if (s->ctor) + *p++ = 'C'; if (s->hwcache_align) *p++ = 'A'; if (s->poison) @@ -579,12 +611,18 @@ void slabcache(struct slabinfo *s) total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0, total_free ? (s->free_fastpath * 100 / total_free) : 0, s->order_fallback, s->order); - } + } else + if (show_defragcount) + printf("%-21s %8ld %7lu %7lu %7lu %7lu %7lu %7lu\n", + s->name, s->objects, s->shrink_calls, s->shrink_attempt_defrag, + s->shrink_slab_reclaimed, s->shrink_empty_slab, + s->shrink_slab_skipped, s->shrink_object_reclaim_failed); else printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", s->name, s->objects, s->object_size, size_str, dist_str, s->objs_per_slab, s->order, - s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? (s->partial * 100) / + (s->slabs * s->objs_per_slab) : 100, s->slabs ? (s->objects * s->object_size * 100) / (s->slabs * (page_size << s->order)) : 100, flags); @@ -1190,7 +1228,24 @@ void read_slab_dir(void) slab->deactivate_to_tail = get_obj("deactivate_to_tail"); slab->deactivate_remote_frees = get_obj("deactivate_remote_frees"); slab->order_fallback = get_obj("order_fallback"); + slab->shrink_calls = get_obj("shrink_calls"); + slab->shrink_attempt_defrag = get_obj("shrink_attempt_defrag"); + slab->shrink_empty_slab = get_obj("shrink_empty_slab"); + slab->shrink_slab_skipped = get_obj("shrink_slab_skipped"); + slab->shrink_slab_reclaimed = get_obj("shrink_slab_reclaimed"); + slab->shrink_object_reclaim_failed = + get_obj("shrink_object_reclaim_failed"); + slab->defrag_ratio = get_obj("defrag_ratio"); + slab->remote_node_defrag_ratio = + get_obj("remote_node_defrag_ratio"); chdir(".."); + if (read_slab_obj(slab, "ops")) { + if (strstr(buffer, "ctor :")) + slab->ctor = 1; + if (strstr(buffer, "kick :")) + slab->defrag = 1; + } + if (slab->name[0] == ':') alias_targets++; slab++; @@ -1241,10 +1296,13 @@ void output_slabs(void) struct option opts[] = { { "aliases", 0, NULL, 'a' }, { "activity", 0, NULL, 'A' }, + { "ctor", 0, NULL, 'C' }, { "debug", 2, NULL, 'd' }, { "display-activity", 0, NULL, 'D' }, + { "display-defrag", 0, NULL, 'G' }, { "empty", 0, NULL, 'e' }, { "first-alias", 0, NULL, 'f' }, + { "defrag", 0, NULL, 'F' }, { "help", 0, NULL, 'h' }, { "inverted", 0, NULL, 'i'}, { "numa", 0, NULL, 'n' }, @@ -1267,7 +1325,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS", + while ((c = getopt_long(argc, argv, "aACd::DefFGhil1noprstvzTS", opts, NULL)) != -1) switch (c) { case '1': @@ -1293,6 +1351,9 @@ int main(int argc, char *argv[]) case 'f': show_first_alias = 1; break; + case 'G': + show_defragcount = 1; + break; case 'h': usage(); return 0; @@ -1323,6 +1384,12 @@ int main(int argc, char *argv[]) case 'z': skip_zero = 0; break; + case 'C': + show_ctor = 1; + break; + case 'F': + show_defrag = 1; + break; case 'T': show_totals = 1; break; diff --git a/fs/buffer.c b/fs/buffer.c index 6569fda5cfed..706817a3173d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3318,6 +3318,107 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +/* + * Writeback a 
page to clean the dirty state + */ +static void trigger_write(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int rc; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 0 + }; + + if (PageWriteback(page)) + goto unlock; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + goto unlock; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + goto unlock; + + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return; + + if (rc == AOP_WRITEPAGE_ACTIVATE) +unlock: unlock_page(page); +} + +/* + * Get references on buffers. + * + * We obtain references on the page that uses the buffer. v[i] will point to + * the corresponding page after get_buffers() is through. + * + * We are safe from the underlying page being removed simply by doing + * a get_page_unless_zero. The buffer head removal may race at will. + * try_to_free_buffers will later take appropriate locks to remove the + * buffers if they are still there. + */ +static void *get_buffers(struct kmem_cache *s, int nr, void **v) +{ + struct page *page; + struct buffer_head *bh; + int i, j; + int n = 0; + + for (i = 0; i < nr; i++) { + bh = v[i]; + v[i] = NULL; + + page = bh->b_page; + + if (page && PagePrivate(page)) { + for (j = 0; j < n; j++) + if (page == v[j]) + continue; + } + + if (get_page_unless_zero(page)) + v[n++] = page; + } + return NULL; +} + +/* + * Despite its name: kick_buffers operates on a list of pointers to + * page structs that was set up by get_buffers(). + */ +static void kick_buffers(struct kmem_cache *s, int nr, void **v, + void *private) +{ + struct page *page; + int i; + + for (i = 0; i < nr; i++) { + page = v[i]; + + if (!page) + continue; + + if (trylock_page(page)) { + if (PageDirty(page)) + trigger_write(page); + else { + if (PagePrivate(page)) + try_to_free_buffers(page); + unlock_page(page); + } + } + put_page(page); + } +} + static void init_buffer_head(void *data) { @@ -3336,6 +3437,7 @@ void __init buffer_init(void) (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), init_buffer_head); + kmem_cache_setup_defrag(bh_cachep, get_buffers, kick_buffers); /* * Limit the bh occupancy to 10% of ZONE_NORMAL diff --git a/fs/dcache.c b/fs/dcache.c index a1d86c7f3e66..bb653fa5a1a4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/seqlock.h> #include <linux/swap.h> #include <linux/bootmem.h> +#include <linux/backing-dev.h> #include "internal.h" @@ -142,15 +143,6 @@ static void dentry_lru_add_tail(struct dentry *dentry) static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; - } -} - -static void dentry_lru_del_init(struct dentry *dentry) -{ - if (likely(!list_empty(&dentry->d_lru))) { list_del_init(&dentry->d_lru); dentry->d_sb->s_nr_dentry_unused--; dentry_stat.nr_unused--; @@ -173,7 +165,10 @@ static struct dentry *d_kill(struct dentry *dentry) list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ - /*drops the locks, at that point nobody can reach this dentry */ + /* + * drops the locks, at that point nobody (aside from defrag) + * can reach this dentry + */ dentry_iput(dentry); if (IS_ROOT(dentry)) parent = NULL; @@ -320,7 +315,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry *
__dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); return dentry; } @@ -436,7 +431,7 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); @@ -496,7 +491,7 @@ restart: } while (!list_empty(&tmp)) { dentry = list_entry(tmp.prev, struct dentry, d_lru); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from @@ -625,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -639,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_del_init(loop); + dentry_lru_del(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -822,7 +817,7 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_del_init(dentry); + dentry_lru_del(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache @@ -904,6 +899,16 @@ static struct shrinker dcache_shrinker = { .seeks = DEFAULT_SEEKS, }; +static void dcache_ctor(void *p) +{ + struct dentry *dentry = p; + + spin_lock_init(&dentry->d_lock); + dentry->d_inode = NULL; + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_alias); +} + /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate @@ -941,8 +946,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) atomic_set(&dentry->d_count, 1); dentry->d_flags = DCACHE_UNHASHED; - spin_lock_init(&dentry->d_lock); - dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; dentry->d_op = NULL; @@ -952,9 +955,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_cookie = NULL; #endif INIT_HLIST_NODE(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); - INIT_LIST_HEAD(&dentry->d_alias); if (parent) { dentry->d_parent = dget(parent); @@ -2278,19 +2279,110 @@ static void __init dcache_init_early(void) INIT_HLIST_HEAD(&dentry_hashtable[loop]); } +/* + * The slab allocator is holding off frees. We can safely examine + * the object without the danger of it vanishing from under us. + */ +static void *get_dentries(struct kmem_cache *s, int nr, void **v) +{ + struct dentry *dentry; + int i; + + spin_lock(&dcache_lock); + for (i = 0; i < nr; i++) { + dentry = v[i]; + + /* + * Three sorts of dentries cannot be reclaimed: + * + * 1. dentries that are in the process of being allocated + * or being freed. In that case the dentry is neither + * on the LRU nor hashed. + * + * 2. Fake hashed entries as used for anonymous dentries + * and pipe I/O. The fake hashed entries have d_flags + * set to indicate a hashed entry. However, the + * d_hash field indicates that the entry is not hashed. + * + * 3. dentries that have a backing store that is not + * writable. This is true for tmpfs and other in + * memory filesystems. Removing dentries from them + * would lose dentries for good.
+ */ + if ((d_unhashed(dentry) && list_empty(&dentry->d_lru)) || + (!d_unhashed(dentry) && hlist_unhashed(&dentry->d_hash)) || + (dentry->d_inode && + !mapping_cap_writeback_dirty(dentry->d_inode->i_mapping))) + /* Ignore this dentry */ + v[i] = NULL; + else + /* dget_locked will remove the dentry from the LRU */ + dget_locked(dentry); + } + spin_unlock(&dcache_lock); + return NULL; +} + +/* + * Slab has dropped all the locks. Get rid of the refcount obtained + * earlier and also free the object. + */ +static void kick_dentries(struct kmem_cache *s, + int nr, void **v, void *private) +{ + struct dentry *dentry; + int i; + + /* + * First invalidate the dentries without holding the dcache lock + */ + for (i = 0; i < nr; i++) { + dentry = v[i]; + + if (dentry) + d_invalidate(dentry); + } + + /* + * If we are the last one holding a reference then the dentries can + * be freed. We need the dcache_lock. + */ + spin_lock(&dcache_lock); + for (i = 0; i < nr; i++) { + dentry = v[i]; + if (!dentry) + continue; + + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count) > 1) { + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + dput(dentry); + spin_lock(&dcache_lock); + continue; + } + + prune_one_dentry(dentry); + } + spin_unlock(&dcache_lock); + + /* + * dentries are freed using RCU so we need to wait until RCU + * operations are complete. + */ + synchronize_rcu(); +} + static void __init dcache_init(void) { int loop; - /* - * A constructor could be added for stable state like the lists, - * but it is probably not worth it because of the cache nature - * of the dcache. - */ - dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); - + dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD, + dcache_ctor); + register_shrinker(&dcache_shrinker); + kmem_cache_setup_defrag(dentry_cache, get_dentries, kick_dentries); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 3e5637fc3779..55f68b421547 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -58,7 +58,7 @@ static void drop_slab(void) int nr_objects; do { - nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL); } while (nr_objects > 10); } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..cf26ff18a795 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,6 +171,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext2_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", @@ -180,6 +186,9 @@ static int init_inodecache(void) init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; + + kmem_cache_setup_defrag(ext2_inode_cachep, + ext2_get_inodes, kick_inodes); return 0; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f6c94f232ec1..58fca5a161ec 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -489,6 +489,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext3_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", @@ -498,6 +504,8 @@ static int 
init_inodecache(void) init_once); if (ext3_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(ext3_inode_cachep, + ext3_get_inodes, kick_inodes); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e4a241c65dbe..2bceec39cb51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -556,6 +556,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct ext4_inode_info, vfs_inode)); +} + static int init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", @@ -565,6 +571,8 @@ static int init_inodecache(void) init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(ext4_inode_cachep, + ext4_get_inodes, kick_inodes); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 0487ddba1397..b33d2adb99bd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1364,6 +1364,128 @@ static int __init set_ihash_entries(char *str) __setup("ihash_entries=", set_ihash_entries); /* + * Obtain a refcount on a list of struct inodes pointed to by v. If the + * inode is in the process of being freed then zap the v[] entry so that + * we skip the freeing attempts later. + * + * This is a generic function for the ->get slab defrag callback. + */ +void *get_inodes(struct kmem_cache *s, int nr, void **v) +{ + int i; + + spin_lock(&inode_lock); + for (i = 0; i < nr; i++) { + struct inode *inode = v[i]; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + v[i] = NULL; + else + __iget(inode); + } + spin_unlock(&inode_lock); + return NULL; +} +EXPORT_SYMBOL(get_inodes); + +/* + * Function for filesystems that embed struct inode into their own + * fs inode. The offset is the offset of the struct inode in the fs inode. + * + * The function adds to the pointers in v[] in order to make them point to + * struct inode. Then get_inodes() is used to get the refcount. + * The converted v[] pointers can then also be passed to the kick() callback + * without further processing. + */ +void *fs_get_inodes(struct kmem_cache *s, int nr, void **v, + unsigned long offset) +{ + int i; + + for (i = 0; i < nr; i++) + v[i] += offset; + + return get_inodes(s, nr, v); } +EXPORT_SYMBOL(fs_get_inodes); + +/* + * Generic callback function for slab defrag ->kick methods. Takes the + * array with inodes where we obtained refcounts using fs_get_inodes() + * or get_inodes() and tries to free them. + */ +void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private) +{ + struct inode *inode; + int i; + int abort = 0; + LIST_HEAD(freeable); + int active; + + for (i = 0; i < nr; i++) { + inode = v[i]; + if (!inode) + continue; + + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + if (remove_inode_buffers(inode)) + /* + * Should we really be doing this? Or + * limit the writeback here to only a few pages? + * + * Possibly an expensive operation but we + * cannot reclaim the inode if the pages + * are still present.
+ */ + invalidate_mapping_pages(&inode->i_data, + 0, -1); + } + + /* Invalidate children and dentry */ + if (S_ISDIR(inode->i_mode)) { + struct dentry *d = d_find_alias(inode); + + if (d) { + d_invalidate(d); + dput(d); + } + } + + if (inode->i_state & I_DIRTY) + write_inode_now(inode, 1); + + d_prune_aliases(inode); + } + + mutex_lock(&iprune_mutex); + for (i = 0; i < nr; i++) { + inode = v[i]; + + if (!inode) + /* inode is already being freed */ + continue; + + active = inode->i_sb->s_flags & MS_ACTIVE; + iput(inode); + if (abort || !active) + continue; + + spin_lock(&inode_lock); + abort = !can_unuse(inode); + + if (!abort) { + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + inodes_stat.nr_unused--; + } + spin_unlock(&inode_lock); + } + dispose_list(&freeable); + mutex_unlock(&iprune_mutex); +} +EXPORT_SYMBOL(kick_inodes); + +/* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) @@ -1402,6 +1524,7 @@ void __init inode_init(void) SLAB_MEM_SPREAD), init_once); register_shrinker(&icache_shrinker); + kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes); /* Hash may have been set up in inode_init_early */ if (!hashdist) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c658..bcb674275348 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,12 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } +static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct proc_inode, vfs_inode)); +} + void __init proc_init_inodecache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", @@ -113,6 +119,8 @@ void __init proc_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), init_once); + kmem_cache_setup_defrag(proc_inode_cachep, + proc_get_inodes, kick_inodes); } static const struct super_operations proc_sops = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 663a91f5dce8..0d4c1a2f0f74 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -532,6 +532,12 @@ static void init_once(void *foo) #endif } +static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v) +{ + return fs_get_inodes(s, nr, v, + offsetof(struct reiserfs_inode_info, vfs_inode)); +} + static int init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", @@ -542,6 +548,8 @@ static int init_inodecache(void) init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; + kmem_cache_setup_defrag(reiserfs_inode_cachep, + reiserfs_get_inodes, kick_inodes); return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 0dcdd9458f4b..eba5ecbf127c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1907,6 +1907,12 @@ static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } +/* Helper functions for inode defragmentation support in filesystems */ +extern void kick_inodes(struct kmem_cache *, int, void **, void *); +extern void *get_inodes(struct kmem_cache *, int nr, void **); +extern void *fs_get_inodes(struct kmem_cache *, int nr, void **, + unsigned long offset); + extern struct file * get_empty_filp(void); extern void file_move(struct file *f, struct list_head *list); extern void file_kill(struct file *f); diff --git a/include/linux/mm.h b/include/linux/mm.h index ffee2f743418..76b407cf8493 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1263,8 +1263,7 @@ int
drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages); - + unsigned long lru_pages, struct zone *z); #ifndef CONFIG_MMU #define randomize_va_space 0 #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 35a7b5e19465..d9e765269136 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -327,6 +327,7 @@ struct zone { unsigned long recent_scanned[2]; unsigned long pages_scanned; /* since last reclaim */ + unsigned long slab_defrag_counter; /* since last defrag */ unsigned long flags; /* zone flags, see below */ /* Zone statistics */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b12f93a3c345..487cd3b6bcfe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -117,6 +117,7 @@ enum pageflags { /* SLUB */ PG_slub_frozen = PG_active, PG_slub_debug = PG_error, + PG_slub_kickable = PG_dirty, }; #ifndef __GENERATING_BOUNDS_H @@ -201,6 +202,7 @@ __PAGEFLAG(SlobFree, slob_free) __PAGEFLAG(SlubFrozen, slub_frozen) __PAGEFLAG(SlubDebug, slub_debug) +__PAGEFLAG(SlubKickable, slub_kickable) /* * Only test-and-set exist for PG_writeback. The unconditional operators are diff --git a/include/linux/slab.h b/include/linux/slab.h index f96d13c281e8..1331fe06223a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -130,6 +130,59 @@ void kfree(const void *); size_t ksize(const void *); /* + * Function prototypes passed to kmem_cache_defrag() to enable defragmentation + * and targeted reclaim in slab caches. + */ + +/* + * kmem_cache_defrag_get_func() is called with locks held so that the slab + * objects cannot be freed. We are in an atomic context and no slab + * operations may be performed. The purpose of kmem_cache_defrag_get_func() + * is to obtain a stable refcount on the objects, so that they cannot be + * removed until kmem_cache_kick_func() has handled them. + * + * Parameters passed are the number of objects to process and an array of + * pointers to objects for which we need references. + * + * Returns a pointer that is passed to the kick function. If any objects + * cannot be moved then the pointer may indicate a failure and + * then kick can simply remove the references that were already obtained. + * + * The object pointer array passed is also passed to kmem_cache_defrag_kick(). + * The function may remove objects from the array by setting pointers to + * NULL. This is useful if we can determine that an object is already about + * to be removed. In that case it is often impossible to obtain the necessary + * refcount. + */ +typedef void *kmem_defrag_get_func(struct kmem_cache *, int, void **); + +/* + * kmem_cache_defrag_kick_func is called with no locks held and interrupts + * enabled. Sleeping is possible. Any operation may be performed in kick(). + * The kick function should free all the objects in the pointer array. + * + * Parameters passed are the number of objects in the array, the array of + * pointers to the objects and the pointer returned by kmem_cache_defrag_get(). + * + * Success is checked by examining the number of remaining objects in the slab. + */ +typedef void kmem_defrag_kick_func(struct kmem_cache *, int, void **, void *); + +/* + * kmem_cache_setup_defrag() is used to setup callbacks for a slab cache. + * kmem_cache_defrag() performs the actual defragmentation.
*/ +#ifdef CONFIG_SLUB +void kmem_cache_setup_defrag(struct kmem_cache *, kmem_defrag_get_func, + kmem_defrag_kick_func); +int kmem_cache_defrag(int node); +#else +static inline void kmem_cache_setup_defrag(struct kmem_cache *s, + kmem_defrag_get_func get, kmem_defrag_kick_func kick) {} +static inline int kmem_cache_defrag(int node) { return 0; } +#endif + +/* * Allocator specific definitions. These are mainly used to establish optimized * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by * selecting the appropriate general cache at compile time. diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dc28432b5b9a..5f8dfcf74b71 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -31,6 +31,12 @@ enum stat_item { DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ ORDER_FALLBACK, /* Number of times fallback was necessary */ + SHRINK_CALLS, /* Number of invocations of kmem_cache_shrink */ + SHRINK_ATTEMPT_DEFRAG, /* Slabs that were attempted to be reclaimed */ + SHRINK_EMPTY_SLAB, /* Shrink encountered and freed empty slab */ + SHRINK_SLAB_SKIPPED, /* Slab reclaim skipped a slab (busy etc) */ + SHRINK_SLAB_RECLAIMED, /* Successfully reclaimed slabs */ + SHRINK_OBJECT_RECLAIM_FAILED, /* Callbacks signaled busy objects */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { @@ -88,8 +94,18 @@ struct kmem_cache { gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); + kmem_defrag_get_func *get; + kmem_defrag_kick_func *kick; + int inuse; /* Offset to metadata */ int align; /* Alignment */ + int defrag_ratio; /* + * Ratio used to check the percentage of + * objects allocated in a slab page. + * If less than this ratio is allocated + * then reclaim attempts are made. + */ + const char *name; /* Name (only for display!)
*/ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SLUB_DEBUG diff --git a/include/linux/swap.h b/include/linux/swap.h index a3af95b2cb6d..e9584191067f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,6 +217,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int slab_defrag_limit; +extern int slab_defrag_counter; + extern int remove_mapping(struct address_space *mapping, struct page *page); extern long vm_total_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d902..7a7a07a3f5f8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1078,6 +1078,26 @@ static struct ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slab_defrag_limit", + .data = &slab_defrag_limit, + .maxlen = sizeof(slab_defrag_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &one_hundred, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slab_defrag_count", + .data = &slab_defrag_counter, + .maxlen = sizeof(slab_defrag_counter), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { .ctl_name = VM_LEGACY_VA_LAYOUT, diff --git a/mm/slub.c b/mm/slub.c index 3d9b6d064798..9446012a20f1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -129,10 +129,10 @@ /* * Maximum number of desirable partial slabs. - * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * More slabs cause kmem_cache_shrink to sort the slabs by objects + * and trigger slab defragmentation. */ -#define MAX_PARTIAL 10 +#define MAX_PARTIAL 20 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -179,6 +179,9 @@ static enum { static DECLARE_RWSEM(slub_lock); static LIST_HEAD(slab_caches); +/* Maximum objects in defragmentable slabs */ +static unsigned int max_defrag_slab_objects; + /* * Tracking user of a slab.
*/ @@ -1133,6 +1136,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) SLAB_STORE_USER | SLAB_TRACE)) __SetPageSlubDebug(page); + if (s->kick) + __SetPageSlubKickable(page); + start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) @@ -1173,6 +1179,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlubKickable(page); __ClearPageSlab(page); reset_page_mapcount(page); __free_pages(page, order); @@ -1383,6 +1390,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) if (SLABDEBUG && PageSlubDebug(page) && (s->flags & SLAB_STORE_USER)) add_full(n, page); + if (s->kick) + __SetPageSlubKickable(page); } slab_unlock(page); } else { @@ -2346,6 +2355,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, goto error; s->refcount = 1; + s->defrag_ratio = 30; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -2552,7 +2562,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, flags, NULL)) goto panic; - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto panic; @@ -2615,7 +2625,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) goto unlock_out; } - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); kmalloc_caches_dma[index] = s; schedule_work(&sysfs_add_work); @@ -2804,76 +2814,284 @@ void kfree(const void *x) EXPORT_SYMBOL(kfree); /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * Allocate a slab scratch space that is sufficient to keep at least + * max_defrag_slab_objects pointers to individual objects and also a bitmap + * for max_defrag_slab_objects. + */ +static inline void *alloc_scratch(void) +{ + return kmalloc(max_defrag_slab_objects * sizeof(void *) + + BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long), + GFP_KERNEL); +} + +void kmem_cache_setup_defrag(struct kmem_cache *s, + kmem_defrag_get_func get, kmem_defrag_kick_func kick) +{ + int max_objects = oo_objects(s->max); + + /* + * Defragmentable slabs must have a ctor otherwise objects may be + * in an undetermined state after they are allocated. + */ + BUG_ON(!s->ctor); + s->get = get; + s->kick = kick; + down_write(&slub_lock); + list_move(&s->list, &slab_caches); + if (max_objects > max_defrag_slab_objects) + max_defrag_slab_objects = max_objects; + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_setup_defrag); + +/* + * Vacate all objects in the given slab. * - * The slabs with the least items are placed last. This results in them - * being allocated from last increasing the chance that the last objects - * are freed in them. + * The scratch area passed to this function is sufficient to hold + * struct list_head times objects per slab. We use it to hold void ** times + * objects per slab plus a bitmap for each object.
*/ -int kmem_cache_shrink(struct kmem_cache *s) +static int kmem_cache_vacate(struct page *page, void *scratch) { - int node; - int i; - struct kmem_cache_node *n; - struct page *page; - struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + void **vector = scratch; + void *p; + void *addr = page_address(page); + struct kmem_cache *s; + unsigned long *map; + int leftover; + int count; + void *private; unsigned long flags; + unsigned long objects; + struct kmem_cache_cpu *c; - if (!slabs_by_inuse) - return -ENOMEM; + local_irq_save(flags); + slab_lock(page); - flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); + BUG_ON(!PageSlab(page)); /* Must be a slab page */ + BUG_ON(!PageSlubFrozen(page)); /* Slab must have been frozen earlier */ - if (!n->nr_partial) - continue; + s = page->slab; + objects = page->objects; + map = scratch + objects * sizeof(void **); + if (!page->inuse || !s->kick || !PageSlubKickable(page)) + goto out; - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + /* Determine used objects */ + bitmap_fill(map, objects); + for_each_free_object(p, s, page->freelist) + __clear_bit(slab_index(p, s, addr), map); - spin_lock_irqsave(&n->list_lock, flags); + /* Build vector of pointers to objects */ + count = 0; + memset(vector, 0, objects * sizeof(void **)); + for_each_object(p, s, addr, objects) + if (test_bit(slab_index(p, s, addr), map)) + vector[count++] = p; - /* - * Build lists indexed by the items in use in each slab. - * - * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. - */ - list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { + private = s->get(s, count, vector); + + /* + * Got references. Now we can drop the slab lock. The slab + * is frozen so it cannot vanish from under us nor will + * allocations be performed on the slab. However, unlocking the + * slab will allow concurrent slab_frees to proceed. + */ + slab_unlock(page); + local_irq_restore(flags); + + /* + * Perform the KICK callbacks to remove the objects. + */ + s->kick(s, count, vector, private); + + local_irq_save(flags); + slab_lock(page); +out: + /* + * Check the result and unfreeze the slab + */ + leftover = page->inuse; + c = get_cpu_slab(s, smp_processor_id()); + if (leftover) { + /* Unsuccessful reclaim. Avoid future reclaim attempts. */ + stat(c, SHRINK_OBJECT_RECLAIM_FAILED); + __ClearPageSlubKickable(page); + } else + stat(c, SHRINK_SLAB_RECLAIMED); + unfreeze_slab(s, page, leftover > 0); + local_irq_restore(flags); + return leftover; +} + +/* + * Remove objects from a list of slab pages that have been gathered. + * Must be called with slabs that have been isolated before. + * + * kmem_cache_reclaim() is never called from an atomic context. It + * allocates memory for temporary storage. We are holding the + * slub_lock semaphore which prevents another call into + * the defrag logic.
+ */ +int kmem_cache_reclaim(struct list_head *zaplist) +{ + int freed = 0; + void **scratch; + struct page *page; + struct page *page2; + + if (list_empty(zaplist)) + return 0; + + scratch = alloc_scratch(); + if (!scratch) + return 0; + + list_for_each_entry_safe(page, page2, zaplist, lru) { + list_del(&page->lru); + if (kmem_cache_vacate(page, scratch) == 0) + freed++; + } + kfree(scratch); + return freed; +} + +/* + * Shrink the slab cache on a particular node of the cache + * by releasing slabs with zero objects and trying to reclaim + * slabs with less than the configured percentage of objects allocated. + */ +static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node, + unsigned long limit) +{ + unsigned long flags; + struct page *page, *page2; + LIST_HEAD(zaplist); + int freed = 0; + struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_cpu *c; + + if (n->nr_partial <= limit) + return 0; + + spin_lock_irqsave(&n->list_lock, flags); + c = get_cpu_slab(s, smp_processor_id()); + stat(c, SHRINK_CALLS); + list_for_each_entry_safe(page, page2, &n->partial, lru) { + if (!slab_trylock(page)) + /* Busy slab. Get out of the way */ + continue; + + if (page->inuse) { + if (!PageSlubKickable(page) || page->inuse * 100 >= + s->defrag_ratio * page->objects) { + slab_unlock(page); /* + * Slab contains enough objects + * or we already tried reclaim before and + * it failed. Skip this one. */ - list_del(&page->lru); + continue; + } + + list_move(&page->lru, &zaplist); + if (s->kick) { + stat(c, SHRINK_ATTEMPT_DEFRAG); n->nr_partial--; - slab_unlock(page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); + __SetPageSlubFrozen(page); } + slab_unlock(page); + } else { + /* Empty slab page */ + stat(c, SHRINK_EMPTY_SLAB); + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + freed++; } + } + if (!s->kick) /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * No defrag methods. By simply putting the zaplist at the + * end of the partial list we can let them simmer longer + * and thus increase the chance of all objects being + * reclaimed. + * + * We have effectively sorted the partial list and put + * the slabs with more objects first. As soon as they + * are allocated they are going to be removed from the + * partial list. */ - for (i = objects - 1; i >= 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + list_splice(&zaplist, n->partial.prev); - spin_unlock_irqrestore(&n->list_lock, flags); + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (s->kick) + freed += kmem_cache_reclaim(&zaplist); + + return freed; +} + +/* + * Defrag slabs conditional on the amount of fragmentation in a page. + */ +int kmem_cache_defrag(int node) +{ + struct kmem_cache *s; + unsigned long slabs = 0; + + /* + * kmem_cache_defrag may be called from the reclaim path which may be + * called for any page allocator alloc. So there is the danger that we + * get called in a situation where slub already acquired the slub_lock + * for other purposes. + */ + if (!down_read_trylock(&slub_lock)) + return 0; + + list_for_each_entry(s, &slab_caches, list) { + unsigned long reclaimed = 0; + + /* + * Defragmentable caches come first. If the slab cache is not + * defragmentable then we can stop traversing the list.
+ */ + if (!s->kick) + break; + + if (node == -1) { + int nid; + + for_each_node_state(nid, N_NORMAL_MEMORY) + reclaimed += __kmem_cache_shrink(s, nid, + MAX_PARTIAL); + } else + reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL); + + slabs += reclaimed; } + up_read(&slub_lock); + return slabs; +} +EXPORT_SYMBOL(kmem_cache_defrag); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists. + * If the slab cache supports defragmentation then objects are + * reclaimed. + */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + + flush_all(s); + for_each_node_state(node, N_NORMAL_MEMORY) + __kmem_cache_shrink(s, node, 0); - kfree(slabs_by_inuse); return 0; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -3096,7 +3314,7 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; - if (s->ctor) + if (s->ctor || s->kick || s->get) return 1; /* @@ -3185,7 +3403,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (s) { if (kmem_cache_open(s, GFP_KERNEL, name, size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); + list_add_tail(&s->list, &slab_caches); up_write(&slub_lock); if (sysfs_slab_add(s)) goto err; @@ -3884,16 +4102,32 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR(order); -static ssize_t ctor_show(struct kmem_cache *s, char *buf) +static ssize_t ops_show(struct kmem_cache *s, char *buf) { + int x = 0; + if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); + x += sprintf(buf + x, "ctor : "); + x += sprint_symbol(buf + x, (unsigned long)s->ctor); + x += sprintf(buf + x, "\n"); + } - return n + sprintf(buf + n, "\n"); + if (s->get) { + x += sprintf(buf + x, "get : "); + x += sprint_symbol(buf + x, + (unsigned long)s->get); + x += sprintf(buf + x, "\n"); } - return 0; + + if (s->kick) { + x += sprintf(buf + x, "kick : "); + x += sprint_symbol(buf + x, + (unsigned long)s->kick); + x += sprintf(buf + x, "\n"); + } + return x; } -SLAB_ATTR_RO(ctor); +SLAB_ATTR_RO(ops); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { @@ -4113,6 +4347,27 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(free_calls); +static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->defrag_ratio); +} + +static ssize_t defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long ratio; + int err; + + err = strict_strtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio < 100) + s->defrag_ratio = ratio; + return length; +} +SLAB_ATTR(defrag_ratio); + #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4192,6 +4447,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(SHRINK_CALLS, shrink_calls); +STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag); +STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab); +STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped); +STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed); +STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed); #endif static struct attribute *slab_attrs[] = { @@ -4205,7 +4466,7 @@ static struct attribute *slab_attrs[] = { &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, - &ctor_attr.attr, + &ops_attr.attr, &aliases_attr.attr, &align_attr.attr, 
&sanity_checks_attr.attr, @@ -4220,6 +4481,7 @@ static struct attribute *slab_attrs[] = { &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, + &defrag_ratio_attr.attr, #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4245,6 +4507,12 @@ static struct attribute *slab_attrs[] = { &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, + &shrink_calls_attr.attr, + &shrink_attempt_defrag_attr.attr, + &shrink_empty_slab_attr.attr, + &shrink_slab_skipped_attr.attr, + &shrink_slab_reclaimed_attr.attr, + &shrink_object_reclaim_failed_attr.attr, #endif NULL }; diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ea1440b53db..6aa67117e6d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -151,6 +151,14 @@ void unregister_shrinker(struct shrinker *shrinker) EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +/* + * Trigger a call into slab defrag if the sum of the returns from + * shrinkers crosses this value. + */ +int slab_defrag_limit = 1000; +int slab_defrag_counter; + /* * Call the shrink functions to age shrinkable caches * @@ -168,10 +176,18 @@ EXPORT_SYMBOL(unregister_shrinker); * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. * + * zone is the zone for which we are shrinking the slabs. If the intent + * is to do a global shrink then zone may be NULL. Specification of a + * zone is currently only used to limit slab defragmentation to a NUMA node. + * The performance of shrink_slab would be better (in particular under NUMA) + * if it could be targeted as a whole to the zone that is under memory + * pressure but the VFS infrastructure does not allow that at the present + * time. + * * Returns the number of slab objects which we shrunk. */ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, - unsigned long lru_pages) + unsigned long lru_pages, struct zone *zone) { struct shrinker *shrinker; unsigned long ret = 0; @@ -228,6 +244,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, shrinker->nr += total_scan; } up_read(&shrinker_rwsem); + + + /* Avoid dirtying cachelines */ + if (!ret) + return 0; + + /* + * "ret" doesn't really contain the freed object count. The shrinkers + * fake it. Gotta go with what we are getting though. + * + * Handling of the defrag_counter is also racy. If we get the + * wrong counts then we may unnecessarily do a defrag pass or defer + * one. "ret" is already faked. So this is just increasing + * the already existing fuzziness to get some notion as to when + * to initiate slab defrag which will hopefully be okay. + */ + if (zone) { + /* balance_pgdat running on a zone so we only scan one node */ + zone->slab_defrag_counter += ret; + if (zone->slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + zone->slab_defrag_counter = 0; + kmem_cache_defrag(zone_to_nid(zone)); + } + } else { + /* Direct (and thus global) reclaim.
Scan all nodes */ + slab_defrag_counter += ret; + if (slab_defrag_counter > slab_defrag_limit && + (gfp_mask & __GFP_FS)) { + slab_defrag_counter = 0; + kmem_cache_defrag(-1); + } + } return ret; } @@ -1586,7 +1635,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1820,7 +1869,7 @@ loop_again: nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, - lru_pages); + lru_pages, zone); nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) @@ -2060,7 +2109,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL); if (!reclaim_state.reclaimed_slab) break; @@ -2098,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -2115,7 +2164,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages(), NULL); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2277,7 +2326,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + while (shrink_slab(sc.nr_scanned, gfp_mask, order, + zone) && zone_page_state(zone, NR_SLAB_RECLAIMABLE) > slab_reclaimable - nr_pages) ; diff --git a/mm/vmstat.c b/mm/vmstat.c index c3ccfda23adc..c4b7280e6e70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,11 +774,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, + "\n slab_defrag_count: %lu" "\n all_unreclaimable: %u" "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), + zone->slab_defrag_counter, + zone_is_all_unreclaimable(zone), zone->prev_priority, zone->zone_start_pfn, zone->inactive_ratio); |
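Two knobs govern how often any of this work happens. A worked restatement of the per-slab check in __kmem_cache_shrink() above, with illustrative numbers (a sketch, not part of the patch):

/*
 * defrag_ratio (per cache, default 30): a partial slab is only put on
 * the zaplist while inuse * 100 < defrag_ratio * objects. With 32
 * objects per slab, at most 9 objects may still be in use
 * (9 * 100 = 900 < 30 * 32 = 960); a slab with 10 objects in use
 * (1000 >= 960) is left alone.
 */
static int worth_vacating(struct page *page, struct kmem_cache *s)
{
	return PageSlubKickable(page) &&
	       page->inuse * 100 < s->defrag_ratio * page->objects;
}

At the system level, shrink_slab() accumulates the shrinker return values in slab_defrag_counter (per zone for kswapd, globally for direct reclaim) and calls kmem_cache_defrag() only once the sum exceeds /proc/sys/vm/slab_defrag_limit (default 1000) and the allocation permits __GFP_FS.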
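For reference, the scratch buffer that kmem_cache_vacate() receives is laid out as a pointer vector followed by an in-use bitmap. A helper restating the alloc_scratch() arithmetic from the mm/slub.c hunk (illustrative, not in the patch):

static size_t vacate_scratch_size(unsigned long objects)
{
	/*
	 * e.g. objects = 32 on 64-bit: 32 * 8 = 256 bytes of pointer
	 * vector, then BITS_TO_LONGS(32) = 1 long (8 bytes) of bitmap,
	 * which is where "map = scratch + objects * sizeof(void **)"
	 * points in kmem_cache_vacate().
	 */
	return objects * sizeof(void *) +
	       BITS_TO_LONGS(objects) * sizeof(unsigned long);
}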