-rw-r--r--  Documentation/sysctl/vm.txt   |  12
-rw-r--r--  Documentation/vm/slabinfo.c   |  95
-rw-r--r--  fs/buffer.c                   | 102
-rw-r--r--  fs/dcache.c                   | 148
-rw-r--r--  fs/drop_caches.c              |   2
-rw-r--r--  fs/ext2/super.c               |   9
-rw-r--r--  fs/ext3/super.c               |   8
-rw-r--r--  fs/ext4/super.c               |   8
-rw-r--r--  fs/inode.c                    | 123
-rw-r--r--  fs/proc/inode.c               |   8
-rw-r--r--  fs/reiserfs/super.c           |   8
-rw-r--r--  include/linux/fs.h            |   6
-rw-r--r--  include/linux/mm.h            |   3
-rw-r--r--  include/linux/mmzone.h        |   1
-rw-r--r--  include/linux/page-flags.h    |   2
-rw-r--r--  include/linux/slab.h          |  53
-rw-r--r--  include/linux/slub_def.h      |  16
-rw-r--r--  include/linux/swap.h          |   3
-rw-r--r--  kernel/sysctl.c               |  20
-rw-r--r--  mm/slub.c                     | 392
-rw-r--r--  mm/vmscan.c                   |  64
-rw-r--r--  mm/vmstat.c                   |   4
22 files changed, 972 insertions, 115 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index d79eeda7a699..5e7329a1abcc 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/vm:
- numa_zonelist_order
- nr_hugepages
- nr_overcommit_hugepages
+- slab_defrag_limit
==============================================================
@@ -347,3 +348,14 @@ Change the maximum size of the hugepage pool. The maximum is
nr_hugepages + nr_overcommit_hugepages.
See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+slab_defrag_limit
+
+Determines the frequency of calls from reclaim into slab defragmentation.
+Slab defrag reclaims objects from sparsely populated slab pages.
+The default is 1000. Increase if slab defragmentation occurs
+too frequently. Decrease if more slab defragmentation passes
+are needed. The slabinfo tool can report on the frequency of the callbacks.
+
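The new tunable is exposed as /proc/sys/vm/slab_defrag_limit (see the kernel/sysctl.c hunk further down). As a quick illustration, and not part of the patch itself, a small userspace sketch that reads the current limit and then raises it to an arbitrary example value of 2000 so that defrag passes are triggered less often (writing requires root):

/* Illustrative only: inspect and adjust the slab_defrag_limit sysctl. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/slab_defrag_limit", "r");
	int limit;

	if (!f || fscanf(f, "%d", &limit) != 1) {
		perror("slab_defrag_limit");
		return 1;
	}
	fclose(f);
	printf("slab_defrag_limit is %d\n", limit);

	/* 2000 is just an example value; pick one that fits the workload. */
	f = fopen("/proc/sys/vm/slab_defrag_limit", "w");
	if (!f) {
		perror("slab_defrag_limit");
		return 1;
	}
	fprintf(f, "%d\n", 2000);
	fclose(f);
	return 0;
}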
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index df3227605d59..75b56e8fc651 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -31,6 +31,8 @@ struct slabinfo {
int hwcache_align, object_size, objs_per_slab;
int sanity_checks, slab_size, store_user, trace;
int order, poison, reclaim_account, red_zone;
+ int defrag, ctor;
+ int defrag_ratio, remote_node_defrag_ratio;
unsigned long partial, objects, slabs, objects_partial, objects_total;
unsigned long alloc_fastpath, alloc_slowpath;
unsigned long free_fastpath, free_slowpath;
@@ -39,6 +41,9 @@ struct slabinfo {
unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
unsigned long deactivate_to_head, deactivate_to_tail;
unsigned long deactivate_remote_frees, order_fallback;
+ unsigned long shrink_calls, shrink_attempt_defrag, shrink_empty_slab;
+ unsigned long shrink_slab_skipped, shrink_slab_reclaimed;
+ unsigned long shrink_object_reclaim_failed;
int numa[MAX_NODES];
int numa_partial[MAX_NODES];
} slabinfo[MAX_SLABS];
@@ -64,6 +69,8 @@ int show_slab = 0;
int skip_zero = 1;
int show_numa = 0;
int show_track = 0;
+int show_defrag = 0;
+int show_ctor = 0;
int show_first_alias = 0;
int validate = 0;
int shrink = 0;
@@ -75,6 +82,7 @@ int sort_active = 0;
int set_debug = 0;
int show_ops = 0;
int show_activity = 0;
+int show_defragcount = 0;
/* Debug options */
int sanity = 0;
@@ -100,20 +108,23 @@ void fatal(const char *x, ...)
void usage(void)
{
printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n"
- "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+ "slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n"
"-a|--aliases Show aliases\n"
"-A|--activity Most active slabs first\n"
"-d<options>|--debug=<options> Set/Clear Debug options\n"
+ "-C|--ctor Show slabs with ctors\n"
"-D|--display-active Switch line format to activity\n"
"-e|--empty Show empty slabs\n"
"-f|--first-alias Show first alias\n"
+ "-F|--defrag Show defragmentable caches\n"
+ "-G|--display-defrag Display defrag counters\n"
"-h|--help Show usage information\n"
"-i|--inverted Inverted list\n"
"-l|--slabs Show slabs\n"
"-n|--numa Show NUMA information\n"
- "-o|--ops Show kmem_cache_ops\n"
+ "-o|--ops Show kmem_cache_ops\n"
"-s|--shrink Shrink slabs\n"
- "-r|--report Detailed report on single slabs\n"
+ "-r|--report Detailed report on single slabs\n"
"-S|--Size Sort by size\n"
"-t|--tracking Show alloc/free information\n"
"-T|--Totals Show summary information\n"
@@ -294,9 +305,11 @@ void first_line(void)
{
if (show_activity)
printf("Name Objects Alloc Free %%Fast Fallb O\n");
+ else if (show_defragcount)
+ printf("Name Objects DefragRQ Slabs Success Empty Skipped Failed\n");
else
printf("Name Objects Objsize Space "
- "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+ "Slabs/Part/Cpu O/S O %%Ra %%Ef Flg\n");
}
/*
@@ -345,7 +358,7 @@ void slab_numa(struct slabinfo *s, int mode)
return;
if (!line) {
- printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+ printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab");
for(node = 0; node <= highest_node; node++)
printf(" %4d", node);
printf("\n----------------------");
@@ -354,6 +367,7 @@ void slab_numa(struct slabinfo *s, int mode)
printf("\n");
}
printf("%-21s ", mode ? "All slabs" : s->name);
+ printf("%3d ", s->remote_node_defrag_ratio);
for(node = 0; node <= highest_node; node++) {
char b[20];
@@ -459,22 +473,28 @@ void slab_stats(struct slabinfo *s)
printf("Total %8lu %8lu\n\n", total_alloc, total_free);
- if (s->cpuslab_flush)
- printf("Flushes %8lu\n", s->cpuslab_flush);
-
- if (s->alloc_refill)
- printf("Refill %8lu\n", s->alloc_refill);
+ if (s->cpuslab_flush || s->alloc_refill)
+ printf("CPU Slab : Flushes=%lu Refills=%lu\n",
+ s->cpuslab_flush, s->alloc_refill);
total = s->deactivate_full + s->deactivate_empty +
s->deactivate_to_head + s->deactivate_to_tail;
if (total)
- printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
+ printf("Deactivate: Full=%lu(%lu%%) Empty=%lu(%lu%%) "
"ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
s->deactivate_full, (s->deactivate_full * 100) / total,
s->deactivate_empty, (s->deactivate_empty * 100) / total,
s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+
+ if (s->shrink_calls)
+ printf("Shrink : Calls=%lu Attempts=%lu Empty=%lu Successful=%lu\n",
+ s->shrink_calls, s->shrink_attempt_defrag,
+ s->shrink_empty_slab, s->shrink_slab_reclaimed);
+ if (s->shrink_slab_skipped || s->shrink_object_reclaim_failed)
+ printf("Defrag : Slabs skipped=%lu Object reclaim failed=%lu\n",
+ s->shrink_slab_skipped, s->shrink_object_reclaim_failed);
}
void report(struct slabinfo *s)
@@ -492,6 +512,8 @@ void report(struct slabinfo *s)
printf("** Slabs are destroyed via RCU\n");
if (s->reclaim_account)
printf("** Reclaim accounting active\n");
+ if (s->defrag)
+ printf("** Defragmentation at %d%%\n", s->defrag_ratio);
printf("\nSizes (bytes) Slabs Debug Memory\n");
printf("------------------------------------------------------------------------\n");
@@ -539,6 +561,12 @@ void slabcache(struct slabinfo *s)
if (show_empty && s->slabs)
return;
+ if (show_defrag && !s->defrag)
+ return;
+
+ if (show_ctor && !s->ctor)
+ return;
+
store_size(size_str, slab_size(s));
snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
s->partial, s->cpu_slabs);
@@ -550,6 +578,10 @@ void slabcache(struct slabinfo *s)
*p++ = '*';
if (s->cache_dma)
*p++ = 'd';
+ if (s->defrag)
+ *p++ = 'F';
+ if (s->ctor)
+ *p++ = 'C';
if (s->hwcache_align)
*p++ = 'A';
if (s->poison)
@@ -579,12 +611,18 @@ void slabcache(struct slabinfo *s)
total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
total_free ? (s->free_fastpath * 100 / total_free) : 0,
s->order_fallback, s->order);
- }
+ } else
+ if (show_defragcount)
+ printf("%-21s %8ld %7lu %7lu %7lu %7lu %7lu %7lu\n",
+ s->name, s->objects, s->shrink_calls, s->shrink_attempt_defrag,
+ s->shrink_slab_reclaimed, s->shrink_empty_slab,
+ s->shrink_slab_skipped, s->shrink_object_reclaim_failed);
else
printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
s->name, s->objects, s->object_size, size_str, dist_str,
s->objs_per_slab, s->order,
- s->slabs ? (s->partial * 100) / s->slabs : 100,
+ s->slabs ? (s->partial * 100) /
+ (s->slabs * s->objs_per_slab) : 100,
s->slabs ? (s->objects * s->object_size * 100) /
(s->slabs * (page_size << s->order)) : 100,
flags);
@@ -1190,7 +1228,24 @@ void read_slab_dir(void)
slab->deactivate_to_tail = get_obj("deactivate_to_tail");
slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
slab->order_fallback = get_obj("order_fallback");
+ slab->shrink_calls = get_obj("shrink_calls");
+ slab->shrink_attempt_defrag = get_obj("shrink_attempt_defrag");
+ slab->shrink_empty_slab = get_obj("shrink_empty_slab");
+ slab->shrink_slab_skipped = get_obj("shrink_slab_skipped");
+ slab->shrink_slab_reclaimed = get_obj("shrink_slab_reclaimed");
+ slab->shrink_object_reclaim_failed =
+ get_obj("shrink_object_reclaim_failed");
+ slab->defrag_ratio = get_obj("defrag_ratio");
+ slab->remote_node_defrag_ratio =
+ get_obj("remote_node_defrag_ratio");
chdir("..");
+ if (read_slab_obj(slab, "ops")) {
+ if (strstr(buffer, "ctor :"))
+ slab->ctor = 1;
+ if (strstr(buffer, "kick :"))
+ slab->defrag = 1;
+ }
+
if (slab->name[0] == ':')
alias_targets++;
slab++;
@@ -1241,10 +1296,13 @@ void output_slabs(void)
struct option opts[] = {
{ "aliases", 0, NULL, 'a' },
{ "activity", 0, NULL, 'A' },
+ { "ctor", 0, NULL, 'C' },
{ "debug", 2, NULL, 'd' },
{ "display-activity", 0, NULL, 'D' },
+ { "display-defrag", 0, NULL, 'G' },
{ "empty", 0, NULL, 'e' },
{ "first-alias", 0, NULL, 'f' },
+ { "defrag", 0, NULL, 'F' },
{ "help", 0, NULL, 'h' },
{ "inverted", 0, NULL, 'i'},
{ "numa", 0, NULL, 'n' },
@@ -1267,7 +1325,7 @@ int main(int argc, char *argv[])
page_size = getpagesize();
- while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+ while ((c = getopt_long(argc, argv, "aACd::DefFGhil1noprstvzTS",
opts, NULL)) != -1)
switch (c) {
case '1':
@@ -1293,6 +1351,9 @@ int main(int argc, char *argv[])
case 'f':
show_first_alias = 1;
break;
+ case 'G':
+ show_defragcount = 1;
+ break;
case 'h':
usage();
return 0;
@@ -1323,6 +1384,12 @@ int main(int argc, char *argv[])
case 'z':
skip_zero = 0;
break;
+ case 'C':
+ show_ctor = 1;
+ break;
+ case 'F':
+ show_defrag = 1;
+ break;
case 'T':
show_totals = 1;
break;
diff --git a/fs/buffer.c b/fs/buffer.c
index 6569fda5cfed..706817a3173d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3318,6 +3318,107 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
+/*
+ * Write back a page to clean its dirty state
+ */
+static void trigger_write(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int rc;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 0
+ };
+
+ if (PageWriteback(page))
+ goto unlock;
+
+ if (!mapping->a_ops->writepage)
+ /* No write method for the address space */
+ goto unlock;
+
+ if (!clear_page_dirty_for_io(page))
+ /* Someone else already triggered a write */
+ goto unlock;
+
+ rc = mapping->a_ops->writepage(page, &wbc);
+ if (rc < 0)
+ /* I/O Error writing */
+ return;
+
+ if (rc == AOP_WRITEPAGE_ACTIVATE)
+unlock: unlock_page(page);
+}
+
+/*
+ * Get references on buffers.
+ *
+ * We obtain references on the page that uses the buffer. v[i] will point to
+ * the corresponding page after get_buffers() is through.
+ *
+ * We are safe from the underlying page being removed simply by doing
+ * a get_page_unless_zero. The buffer head removal may race at will.
+ * try_to_free_buffers will later take appropriate locks to remove the
+ * buffers if they are still there.
+ */
+static void *get_buffers(struct kmem_cache *s, int nr, void **v)
+{
+ struct page *page;
+ struct buffer_head *bh;
+ int i, j;
+ int n = 0;
+
+ for (i = 0; i < nr; i++) {
+ bh = v[i];
+ v[i] = NULL;
+
+ page = bh->b_page;
+
+ if (page && PagePrivate(page)) {
+ for (j = 0; j < n; j++)
+ if (page == v[j])
+ continue;
+ }
+
+ if (get_page_unless_zero(page))
+ v[n++] = page;
+ }
+ return NULL;
+}
+
+/*
+ * Despite its name: kick_buffers operates on a list of pointers to
+ * page structs that was set up by get_buffers().
+ */
+static void kick_buffers(struct kmem_cache *s, int nr, void **v,
+ void *private)
+{
+ struct page *page;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ page = v[i];
+
+ if (!page)
+ continue;
+
+ if (trylock_page(page)) {
+ if (PageDirty(page))
+ trigger_write(page);
+ else {
+ if (PagePrivate(page))
+ try_to_free_buffers(page);
+ unlock_page(page);
+ }
+ }
+ put_page(page);
+ }
+}
+
static void
init_buffer_head(void *data)
{
@@ -3336,6 +3437,7 @@ void __init buffer_init(void)
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
init_buffer_head);
+ kmem_cache_setup_defrag(bh_cachep, get_buffers, kick_buffers);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..bb653fa5a1a4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
#include <linux/seqlock.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
+#include <linux/backing-dev.h>
#include "internal.h"
@@ -142,15 +143,6 @@ static void dentry_lru_add_tail(struct dentry *dentry)
static void dentry_lru_del(struct dentry *dentry)
{
if (!list_empty(&dentry->d_lru)) {
- list_del(&dentry->d_lru);
- dentry->d_sb->s_nr_dentry_unused--;
- dentry_stat.nr_unused--;
- }
-}
-
-static void dentry_lru_del_init(struct dentry *dentry)
-{
- if (likely(!list_empty(&dentry->d_lru))) {
list_del_init(&dentry->d_lru);
dentry->d_sb->s_nr_dentry_unused--;
dentry_stat.nr_unused--;
@@ -173,7 +165,10 @@ static struct dentry *d_kill(struct dentry *dentry)
list_del(&dentry->d_u.d_child);
dentry_stat.nr_dentry--; /* For d_free, below */
- /*drops the locks, at that point nobody can reach this dentry */
+ /*
+ * drops the locks, at that point nobody (aside from defrag)
+ * can reach this dentry
+ */
dentry_iput(dentry);
if (IS_ROOT(dentry))
parent = NULL;
@@ -320,7 +315,7 @@ int d_invalidate(struct dentry * dentry)
static inline struct dentry * __dget_locked(struct dentry *dentry)
{
atomic_inc(&dentry->d_count);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
return dentry;
}
@@ -436,7 +431,7 @@ static void prune_one_dentry(struct dentry * dentry)
if (dentry->d_op && dentry->d_op->d_delete)
dentry->d_op->d_delete(dentry);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
__d_drop(dentry);
dentry = d_kill(dentry);
spin_lock(&dcache_lock);
@@ -496,7 +491,7 @@ restart:
}
while (!list_empty(&tmp)) {
dentry = list_entry(tmp.prev, struct dentry, d_lru);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
spin_lock(&dentry->d_lock);
/*
* We found an inuse dentry which was not removed from
@@ -625,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
/* detach this root from the system */
spin_lock(&dcache_lock);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
__d_drop(dentry);
spin_unlock(&dcache_lock);
@@ -639,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
spin_lock(&dcache_lock);
list_for_each_entry(loop, &dentry->d_subdirs,
d_u.d_child) {
- dentry_lru_del_init(loop);
+ dentry_lru_del(loop);
__d_drop(loop);
cond_resched_lock(&dcache_lock);
}
@@ -822,7 +817,7 @@ resume:
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
/*
* move only zero ref count dentries to the end
* of the unused list for prune_dcache
@@ -904,6 +899,16 @@ static struct shrinker dcache_shrinker = {
.seeks = DEFAULT_SEEKS,
};
+static void dcache_ctor(void *p)
+{
+ struct dentry *dentry = p;
+
+ spin_lock_init(&dentry->d_lock);
+ dentry->d_inode = NULL;
+ INIT_LIST_HEAD(&dentry->d_lru);
+ INIT_LIST_HEAD(&dentry->d_alias);
+}
+
/**
* d_alloc - allocate a dcache entry
* @parent: parent of entry to allocate
@@ -941,8 +946,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
atomic_set(&dentry->d_count, 1);
dentry->d_flags = DCACHE_UNHASHED;
- spin_lock_init(&dentry->d_lock);
- dentry->d_inode = NULL;
dentry->d_parent = NULL;
dentry->d_sb = NULL;
dentry->d_op = NULL;
@@ -952,9 +955,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
dentry->d_cookie = NULL;
#endif
INIT_HLIST_NODE(&dentry->d_hash);
- INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
- INIT_LIST_HEAD(&dentry->d_alias);
if (parent) {
dentry->d_parent = dget(parent);
@@ -2278,19 +2279,110 @@ static void __init dcache_init_early(void)
INIT_HLIST_HEAD(&dentry_hashtable[loop]);
}
+/*
+ * The slab allocator is holding off frees. We can safely examine
+ * the object without the danger of it vanishing from under us.
+ */
+static void *get_dentries(struct kmem_cache *s, int nr, void **v)
+{
+ struct dentry *dentry;
+ int i;
+
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+
+ /*
+ * Three sorts of dentries cannot be reclaimed:
+ *
+ * 1. dentries that are in the process of being allocated
+ * or being freed. In that case the dentry is neither
+ * on the LRU nor hashed.
+ *
+ * 2. Fake hashed entries as used for anonymous dentries
+ * and pipe I/O. The fake hashed entries have d_flags
+ * set to indicate a hashed entry. However, the
+ * d_hash field indicates that the entry is not hashed.
+ *
+ * 3. dentries that have a backing store that is not
+ * writable. This is true for tmpfs and other in-memory
+ * filesystems. Removing dentries from them would lose
+ * dentries for good.
+ */
+ if ((d_unhashed(dentry) && list_empty(&dentry->d_lru)) ||
+ (!d_unhashed(dentry) && hlist_unhashed(&dentry->d_hash)) ||
+ (dentry->d_inode &&
+ !mapping_cap_writeback_dirty(dentry->d_inode->i_mapping)))
+ /* Ignore this dentry */
+ v[i] = NULL;
+ else
+ /* dget_locked will remove the dentry from the LRU */
+ dget_locked(dentry);
+ }
+ spin_unlock(&dcache_lock);
+ return NULL;
+}
+
+/*
+ * Slab has dropped all the locks. Get rid of the refcount obtained
+ * earlier and also free the object.
+ */
+static void kick_dentries(struct kmem_cache *s,
+ int nr, void **v, void *private)
+{
+ struct dentry *dentry;
+ int i;
+
+ /*
+ * First invalidate the dentries without holding the dcache lock
+ */
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+
+ if (dentry)
+ d_invalidate(dentry);
+ }
+
+ /*
+ * If we are the last one holding a reference then the dentries can
+ * be freed. We need the dcache_lock.
+ */
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+ if (!dentry)
+ continue;
+
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count) > 1) {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ continue;
+ }
+
+ prune_one_dentry(dentry);
+ }
+ spin_unlock(&dcache_lock);
+
+ /*
+ * dentries are freed using RCU so we need to wait until RCU
+ * operations are complete.
+ */
+ synchronize_rcu();
+}
+
static void __init dcache_init(void)
{
int loop;
- /*
- * A constructor could be added for stable state like the lists,
- * but it is probably not worth it because of the cache nature
- * of the dcache.
- */
- dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-
+ dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry),
+ 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD,
+ dcache_ctor);
+
register_shrinker(&dcache_shrinker);
+ kmem_cache_setup_defrag(dentry_cache, get_dentries, kick_dentries);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 3e5637fc3779..55f68b421547 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -58,7 +58,7 @@ static void drop_slab(void)
int nr_objects;
do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+ nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
} while (nr_objects > 10);
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..cf26ff18a795 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,6 +171,12 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
+static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext2_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
@@ -180,6 +186,9 @@ static int init_inodecache(void)
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
+
+ kmem_cache_setup_defrag(ext2_inode_cachep,
+ ext2_get_inodes, kick_inodes);
return 0;
}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..58fca5a161ec 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -489,6 +489,12 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
+static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext3_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
@@ -498,6 +504,8 @@ static int init_inodecache(void)
init_once);
if (ext3_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(ext3_inode_cachep,
+ ext3_get_inodes, kick_inodes);
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..2bceec39cb51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -556,6 +556,12 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
+static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext4_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
@@ -565,6 +571,8 @@ static int init_inodecache(void)
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(ext4_inode_cachep,
+ ext4_get_inodes, kick_inodes);
return 0;
}
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba1397..b33d2adb99bd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1364,6 +1364,128 @@ static int __init set_ihash_entries(char *str)
__setup("ihash_entries=", set_ihash_entries);
/*
+ * Obtain a refcount on a list of struct inodes pointed to by v. If the
+ * inode is in the process of being freed then zap the v[] entry so that
+ * we skip the freeing attempts later.
+ *
+ * This is a generic function for the ->get slab defrag callback.
+ */
+void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ int i;
+
+ spin_lock(&inode_lock);
+ for (i = 0; i < nr; i++) {
+ struct inode *inode = v[i];
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+ v[i] = NULL;
+ else
+ __iget(inode);
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(get_inodes);
+
+/*
+ * Function for filesystems that embed struct inode into their own
+ * fs inode. The offset is the offset of the struct inode in the fs inode.
+ *
+ * The function adds to the pointers in v[] in order to make them point to
+ * struct inode. Then get_inodes() is used to get the refcount.
+ * The converted v[] pointers can then also be passed to the kick() callback
+ * without further processing.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v,
+ unsigned long offset)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ v[i] += offset;
+
+ return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+/*
+ * Generic callback function slab defrag ->kick methods. Takes the
+ * array with inodes where we obtained refcounts using fs_get_inodes()
+ * or get_inodes() and tries to free them.
+ */
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+ struct inode *inode;
+ int i;
+ int abort = 0;
+ LIST_HEAD(freeable);
+ int active;
+
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (remove_inode_buffers(inode))
+ /*
+ * Should we really be doing this? Or
+ * limit the writeback here to only a few pages?
+ *
+ * Possibly an expensive operation but we
+ * cannot reclaim the inode if the pages
+ * are still present.
+ */
+ invalidate_mapping_pages(&inode->i_data,
+ 0, -1);
+ }
+
+ /* Invalidate children and dentry */
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *d = d_find_alias(inode);
+
+ if (d) {
+ d_invalidate(d);
+ dput(d);
+ }
+ }
+
+ if (inode->i_state & I_DIRTY)
+ write_inode_now(inode, 1);
+
+ d_prune_aliases(inode);
+ }
+
+ mutex_lock(&iprune_mutex);
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+
+ if (!inode)
+ /* inode is already being freed */
+ continue;
+
+ active = inode->i_sb->s_flags & MS_ACTIVE;
+ iput(inode);
+ if (abort || !active)
+ continue;
+
+ spin_lock(&inode_lock);
+ abort = !can_unuse(inode);
+
+ if (!abort) {
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ inodes_stat.nr_unused--;
+ }
+ spin_unlock(&inode_lock);
+ }
+ dispose_list(&freeable);
+ mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
+/*
* Initialize the waitqueues and inode hash table.
*/
void __init inode_init_early(void)
@@ -1402,6 +1524,7 @@ void __init inode_init(void)
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
+ kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..bcb674275348 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -106,6 +106,12 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
+static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct proc_inode, vfs_inode));
+}
+
void __init proc_init_inodecache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
@@ -113,6 +119,8 @@ void __init proc_init_inodecache(void)
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_PANIC),
init_once);
+ kmem_cache_setup_defrag(proc_inode_cachep,
+ proc_get_inodes, kick_inodes);
}
static const struct super_operations proc_sops = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..0d4c1a2f0f74 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -532,6 +532,12 @@ static void init_once(void *foo)
#endif
}
+static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct reiserfs_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
@@ -542,6 +548,8 @@ static int init_inodecache(void)
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(reiserfs_inode_cachep,
+ reiserfs_get_inodes, kick_inodes);
return 0;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0dcdd9458f4b..eba5ecbf127c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1907,6 +1907,12 @@ static inline void insert_inode_hash(struct inode *inode) {
__insert_inode_hash(inode, inode->i_ino);
}
+/* Helper functions for inode defragmentation support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *get_inodes(struct kmem_cache *, int nr, void **);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+ unsigned long offset);
+
extern struct file * get_empty_filp(void);
extern void file_move(struct file *f, struct list_head *list);
extern void file_kill(struct file *f);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f743418..76b407cf8493 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1263,8 +1263,7 @@ int in_gate_area_no_task(unsigned long addr);
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages);
-
+ unsigned long lru_pages, struct zone *z);
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 35a7b5e19465..d9e765269136 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -327,6 +327,7 @@ struct zone {
unsigned long recent_scanned[2];
unsigned long pages_scanned; /* since last reclaim */
+ unsigned long slab_defrag_counter; /* since last defrag */
unsigned long flags; /* zone flags, see below */
/* Zone statistics */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b12f93a3c345..487cd3b6bcfe 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -117,6 +117,7 @@ enum pageflags {
/* SLUB */
PG_slub_frozen = PG_active,
PG_slub_debug = PG_error,
+ PG_slub_kickable = PG_dirty,
};
#ifndef __GENERATING_BOUNDS_H
@@ -201,6 +202,7 @@ __PAGEFLAG(SlobFree, slob_free)
__PAGEFLAG(SlubFrozen, slub_frozen)
__PAGEFLAG(SlubDebug, slub_debug)
+__PAGEFLAG(SlubKickable, slub_kickable)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
diff --git a/include/linux/slab.h b/include/linux/slab.h
index f96d13c281e8..1331fe06223a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -130,6 +130,59 @@ void kfree(const void *);
size_t ksize(const void *);
/*
+ * Function prototypes passed to kmem_cache_setup_defrag() to enable
+ * and targeted reclaim in slab caches.
+ */
+
+/*
+ * kmem_defrag_get_func() is called with locks held so that the slab
+ * objects cannot be freed. We are in an atomic context and no slab
+ * operations may be performed. The purpose of kmem_defrag_get_func()
+ * is to obtain a stable refcount on the objects, so that they cannot be
+ * removed until kmem_defrag_kick_func() has handled them.
+ *
+ * Parameters passed are the number of objects to process and an array of
+ * pointers to objects for which we need references.
+ *
+ * Returns a pointer that is passed to the kick function. If any objects
+ * cannot be moved then the pointer may indicate a failure and
+ * then kick can simply remove the references that were already obtained.
+ *
+ * The object pointer array passed is also passed to kmem_defrag_kick_func().
+ * The function may remove objects from the array by setting pointers to
+ * NULL. This is useful if we can determine that an object is already about
+ * to be removed. In that case it is often impossible to obtain the necessary
+ * refcount.
+ */
+typedef void *kmem_defrag_get_func(struct kmem_cache *, int, void **);
+
+/*
+ * kmem_defrag_kick_func() is called with no locks held and interrupts
+ * enabled. Sleeping is possible. Any operation may be performed in kick().
+ * The kick function should free all the objects in the pointer array.
+ *
+ * Parameters passed are the number of objects in the array, the array of
+ * pointers to the objects and the pointer returned by kmem_defrag_get_func().
+ *
+ * Success is checked by examining the number of remaining objects in the slab.
+ */
+typedef void kmem_defrag_kick_func(struct kmem_cache *, int, void **, void *);
+
+/*
+ * kmem_cache_setup_defrag() is used to setup callbacks for a slab cache.
+ * kmem_cache_defrag() performs the actual defragmentation.
+ */
+#ifdef CONFIG_SLUB
+void kmem_cache_setup_defrag(struct kmem_cache *, kmem_defrag_get_func,
+ kmem_defrag_kick_func);
+int kmem_cache_defrag(int node);
+#else
+static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
+ kmem_defrag_get_func get, kmem_defrag_kick_func kick) {}
+static inline int kmem_cache_defrag(int node) { return 0; }
+#endif
+
+/*
* Allocator specific definitions. These are mainly used to establish optimized
* ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
* selecting the appropriate general cache at compile time.
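To make the callback contract above concrete, here is a minimal sketch, not part of the patch, of how a cache could register the two methods. The struct foo object, its refcounting scheme and the absence of any cache-wide locking are assumptions made for illustration; the real converts in this series are the inode, dentry and buffer_head caches.

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <asm/atomic.h>

struct foo {
	atomic_t refcount;
	struct list_head lru;
};

static struct kmem_cache *foo_cachep;

/* Defragmentable caches need a ctor so objects are in a known state */
static void foo_ctor(void *p)
{
	struct foo *f = p;

	atomic_set(&f->refcount, 0);
	INIT_LIST_HEAD(&f->lru);
}

/* get(): atomic context, only pin the objects so they cannot vanish */
static void *foo_get(struct kmem_cache *s, int nr, void **v)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct foo *f = v[i];

		if (!atomic_inc_not_zero(&f->refcount))
			v[i] = NULL;	/* object is on its way out, skip it */
	}
	return NULL;
}

/* kick(): no locks held, drop the references so unused objects get freed */
static void foo_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct foo *f = v[i];

		if (f && atomic_dec_and_test(&f->refcount))
			kmem_cache_free(s, f);
	}
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
			0, SLAB_RECLAIM_ACCOUNT, foo_ctor);
	if (!foo_cachep)
		return -ENOMEM;
	kmem_cache_setup_defrag(foo_cachep, foo_get, foo_kick);
	return 0;
}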
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dc28432b5b9a..5f8dfcf74b71 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -31,6 +31,12 @@ enum stat_item {
DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */
DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
ORDER_FALLBACK, /* Number of times fallback was necessary */
+ SHRINK_CALLS, /* Number of invocations of kmem_cache_shrink */
+ SHRINK_ATTEMPT_DEFRAG, /* Slabs that were attempted to be reclaimed */
+ SHRINK_EMPTY_SLAB, /* Shrink encountered and freed empty slab */
+ SHRINK_SLAB_SKIPPED, /* Slab reclaim skipped a slab (busy etc) */
+ SHRINK_SLAB_RECLAIMED, /* Successfully reclaimed slabs */
+ SHRINK_OBJECT_RECLAIM_FAILED, /* Callbacks signaled busy objects */
NR_SLUB_STAT_ITEMS };
struct kmem_cache_cpu {
@@ -88,8 +94,18 @@ struct kmem_cache {
gfp_t allocflags; /* gfp flags to use on each alloc */
int refcount; /* Refcount for slab cache destroy */
void (*ctor)(void *);
+ kmem_defrag_get_func *get;
+ kmem_defrag_kick_func *kick;
+
int inuse; /* Offset to metadata */
int align; /* Alignment */
+ int defrag_ratio; /*
+ * Ratio used to check the percentage of
+ * objects allocated in a slab page.
+ * If less than this ratio is allocated
+ * then reclaim attempts are made.
+ */
+
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
#ifdef CONFIG_SLUB_DEBUG
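As an aside (not part of the patch), the new defrag_ratio field is applied further down in mm/slub.c as the threshold test sketched below; with the default ratio of 30 that kmem_cache_open() sets and, say, 16 objects per slab, a partial slab only becomes a defrag candidate once fewer than 5 of its objects are in use.

/*
 * Sketch of the check __kmem_cache_shrink() applies later in this patch:
 * a slab is worth defragmenting only if less than defrag_ratio percent
 * of its objects are allocated.
 */
static inline int slab_is_defrag_candidate(int inuse, int objects,
						int defrag_ratio)
{
	/* e.g. 4 * 100 < 30 * 16 -> candidate, 5 * 100 >= 480 -> skip */
	return inuse * 100 < defrag_ratio * objects;
}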
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a3af95b2cb6d..e9584191067f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -217,6 +217,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
extern int __isolate_lru_page(struct page *page, int mode, int file);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
+extern int slab_defrag_limit;
+extern int slab_defrag_counter;
+
extern int remove_mapping(struct address_space *mapping, struct page *page);
extern long vm_total_pages;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d902..7a7a07a3f5f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1078,6 +1078,26 @@ static struct ctl_table vm_table[] = {
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "slab_defrag_limit",
+ .data = &slab_defrag_limit,
+ .maxlen = sizeof(slab_defrag_limit),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &one_hundred,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "slab_defrag_count",
+ .data = &slab_defrag_counter,
+ .maxlen = sizeof(slab_defrag_counter),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
{
.ctl_name = VM_LEGACY_VA_LAYOUT,
diff --git a/mm/slub.c b/mm/slub.c
index 3d9b6d064798..9446012a20f1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -129,10 +129,10 @@
/*
* Maximum number of desirable partial slabs.
- * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * More slabs cause kmem_cache_shrink to sort the slabs by objects
+ * and trigger slab defragmentation.
*/
-#define MAX_PARTIAL 10
+#define MAX_PARTIAL 20
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
@@ -179,6 +179,9 @@ static enum {
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects;
+
/*
* Tracking user of a slab.
*/
@@ -1133,6 +1136,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
SLAB_STORE_USER | SLAB_TRACE))
__SetPageSlubDebug(page);
+ if (s->kick)
+ __SetPageSlubKickable(page);
+
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
@@ -1173,6 +1179,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
+ __ClearPageSlubKickable(page);
__ClearPageSlab(page);
reset_page_mapcount(page);
__free_pages(page, order);
@@ -1383,6 +1390,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
if (SLABDEBUG && PageSlubDebug(page) &&
(s->flags & SLAB_STORE_USER))
add_full(n, page);
+ if (s->kick)
+ __SetPageSlubKickable(page);
}
slab_unlock(page);
} else {
@@ -2346,6 +2355,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
goto error;
s->refcount = 1;
+ s->defrag_ratio = 30;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
@@ -2552,7 +2562,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
flags, NULL))
goto panic;
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto panic;
@@ -2615,7 +2625,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
goto unlock_out;
}
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
kmalloc_caches_dma[index] = s;
schedule_work(&sysfs_add_work);
@@ -2804,76 +2814,284 @@ void kfree(const void *x)
EXPORT_SYMBOL(kfree);
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * Allocate a slab scratch space that is sufficient to keep at least
+ * max_defrag_slab_objects pointers to individual objects and also a bitmap
+ * for max_defrag_slab_objects.
+ */
+static inline void *alloc_scratch(void)
+{
+ return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+ BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+ GFP_KERNEL);
+}
+
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+ kmem_defrag_get_func get, kmem_defrag_kick_func kick)
+{
+ int max_objects = oo_objects(s->max);
+
+ /*
+ * Defragmentable slabs must have a ctor otherwise objects may be
+ * in an undetermined state after they are allocated.
+ */
+ BUG_ON(!s->ctor);
+ s->get = get;
+ s->kick = kick;
+ down_write(&slub_lock);
+ list_move(&s->list, &slab_caches);
+ if (max_objects > max_defrag_slab_objects)
+ max_defrag_slab_objects = max_objects;
+ up_write(&slub_lock);
+}
+EXPORT_SYMBOL(kmem_cache_setup_defrag);
+
+/*
+ * Vacate all objects in the given slab.
*
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
+ * The scratch area passed to this function is sufficient to hold
+ * struct list_head times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+static int kmem_cache_vacate(struct page *page, void *scratch)
{
- int node;
- int i;
- struct kmem_cache_node *n;
- struct page *page;
- struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ void **vector = scratch;
+ void *p;
+ void *addr = page_address(page);
+ struct kmem_cache *s;
+ unsigned long *map;
+ int leftover;
+ int count;
+ void *private;
unsigned long flags;
+ unsigned long objects;
+ struct kmem_cache_cpu *c;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ local_irq_save(flags);
+ slab_lock(page);
- flush_all(s);
- for_each_node_state(node, N_NORMAL_MEMORY) {
- n = get_node(s, node);
+ BUG_ON(!PageSlab(page)); /* Must be a slab page */
+ BUG_ON(!PageSlubFrozen(page)); /* Slab must have been frozen earlier */
- if (!n->nr_partial)
- continue;
+ s = page->slab;
+ objects = page->objects;
+ map = scratch + objects * sizeof(void **);
+ if (!page->inuse || !s->kick || !PageSlubKickable(page))
+ goto out;
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ /* Determine used objects */
+ bitmap_fill(map, objects);
+ for_each_free_object(p, s, page->freelist)
+ __clear_bit(slab_index(p, s, addr), map);
- spin_lock_irqsave(&n->list_lock, flags);
+ /* Build vector of pointers to objects */
+ count = 0;
+ memset(vector, 0, objects * sizeof(void **));
+ for_each_object(p, s, addr, objects)
+ if (test_bit(slab_index(p, s, addr), map))
+ vector[count++] = p;
- /*
- * Build lists indexed by the items in use in each slab.
- *
- * Note that concurrent frees may occur while we hold the
- * list_lock. page->inuse here is the upper limit.
- */
- list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse && slab_trylock(page)) {
+ private = s->get(s, count, vector);
+
+ /*
+ * Got references. Now we can drop the slab lock. The slab
+ * is frozen so it cannot vanish from under us nor will
+ * allocations be performed on the slab. However, unlocking the
+ * slab will allow concurrent slab_frees to proceed.
+ */
+ slab_unlock(page);
+ local_irq_restore(flags);
+
+ /*
+ * Perform the KICK callbacks to remove the objects.
+ */
+ s->kick(s, count, vector, private);
+
+ local_irq_save(flags);
+ slab_lock(page);
+out:
+ /*
+ * Check the result and unfreeze the slab
+ */
+ leftover = page->inuse;
+ c = get_cpu_slab(s, smp_processor_id());
+ if (leftover) {
+ /* Unsuccessful reclaim. Avoid future reclaim attempts. */
+ stat(c, SHRINK_OBJECT_RECLAIM_FAILED);
+ __ClearPageSlubKickable(page);
+ } else
+ stat(c, SHRINK_SLAB_RECLAIMED);
+ unfreeze_slab(s, page, leftover > 0);
+ local_irq_restore(flags);
+ return leftover;
+}
+
+/*
+ * Remove objects from a list of slab pages that have been gathered.
+ * Must be called with slabs that have been isolated before.
+ *
+ * kmem_cache_reclaim() is never called from an atomic context. It
+ * allocates memory for temporary storage. We are holding the
+ * slub_lock semaphore which prevents another call into
+ * the defrag logic.
+ */
+int kmem_cache_reclaim(struct list_head *zaplist)
+{
+ int freed = 0;
+ void **scratch;
+ struct page *page;
+ struct page *page2;
+
+ if (list_empty(zaplist))
+ return 0;
+
+ scratch = alloc_scratch();
+ if (!scratch)
+ return 0;
+
+ list_for_each_entry_safe(page, page2, zaplist, lru) {
+ list_del(&page->lru);
+ if (kmem_cache_vacate(page, scratch) == 0)
+ freed++;
+ }
+ kfree(scratch);
+ return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ * by releasing slabs with zero objects and trying to reclaim
+ * slabs with less than the configured percentage of objects allocated.
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node,
+ unsigned long limit)
+{
+ unsigned long flags;
+ struct page *page, *page2;
+ LIST_HEAD(zaplist);
+ int freed = 0;
+ struct kmem_cache_node *n = get_node(s, node);
+ struct kmem_cache_cpu *c;
+
+ if (n->nr_partial <= limit)
+ return 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ c = get_cpu_slab(s, smp_processor_id());
+ stat(c, SHRINK_CALLS);
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ if (!slab_trylock(page))
+ /* Busy slab. Get out of the way */
+ continue;
+
+ if (page->inuse) {
+ if (!PageSlubKickable(page) || page->inuse * 100 >=
+ s->defrag_ratio * page->objects) {
+ slab_unlock(page);
/*
- * Must hold slab lock here because slab_free
- * may have freed the last object and be
- * waiting to release the slab.
+ * Slab contains enough objects
+ * or we already tried reclaim before and
+ * it failed. Skip this one.
*/
- list_del(&page->lru);
+ continue;
+ }
+
+ list_move(&page->lru, &zaplist);
+ if (s->kick) {
+ stat(c, SHRINK_ATTEMPT_DEFRAG);
n->nr_partial--;
- slab_unlock(page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
+ __SetPageSlubFrozen(page);
}
+ slab_unlock(page);
+ } else {
+ /* Empty slab page */
+ stat(c, SHRINK_EMPTY_SLAB);
+ list_del(&page->lru);
+ n->nr_partial--;
+ slab_unlock(page);
+ discard_slab(s, page);
+ freed++;
}
+ }
+ if (!s->kick)
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * No defrag methods. By simply putting the zaplist at the
+ * end of the partial list we can let them simmer longer
+ * and thus increase the chance of all objects being
+ * reclaimed.
+ *
+ * We have effectively sorted the partial list and put
+ * the slabs with more objects first. As soon as they
+ * are allocated they are going to be removed from the
+ * partial list.
*/
- for (i = objects - 1; i >= 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ list_splice(&zaplist, n->partial.prev);
- spin_unlock_irqrestore(&n->list_lock, flags);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (s->kick)
+ freed += kmem_cache_reclaim(&zaplist);
+
+ return freed;
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation in a page.
+ */
+int kmem_cache_defrag(int node)
+{
+ struct kmem_cache *s;
+ unsigned long slabs = 0;
+
+ /*
+ * kmem_cache_defrag may be called from the reclaim path which may be
+ * called for any page allocator alloc. So there is the danger that we
+ * get called in a situation where slub already acquired the slub_lock
+ * for other purposes.
+ */
+ if (!down_read_trylock(&slub_lock))
+ return 0;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ unsigned long reclaimed = 0;
+
+ /*
+ * Defragmentable caches come first. If the slab cache is not
+ * defragmentable then we can stop traversing the list.
+ */
+ if (!s->kick)
+ break;
+
+ if (node == -1) {
+ int nid;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY)
+ reclaimed += __kmem_cache_shrink(s, nid,
+ MAX_PARTIAL);
+ } else
+ reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL);
+
+ slabs += reclaimed;
}
+ up_read(&slub_lock);
+ return slabs;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists.
+ * If the slab cache supports defragmentation then objects are
+ * reclaimed.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ __kmem_cache_shrink(s, node, 0);
- kfree(slabs_by_inuse);
return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -3096,7 +3314,7 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
- if (s->ctor)
+ if (s->ctor || s->kick || s->get)
return 1;
/*
@@ -3185,7 +3403,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (s) {
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto err;
@@ -3884,16 +4102,32 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR(order);
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
{
+ int x = 0;
+
if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "ctor : ");
+ x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "\n");
+ }
- return n + sprintf(buf + n, "\n");
+ if (s->get) {
+ x += sprintf(buf + x, "get : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->get);
+ x += sprintf(buf + x, "\n");
}
- return 0;
+
+ if (s->kick) {
+ x += sprintf(buf + x, "kick : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->kick);
+ x += sprintf(buf + x, "\n");
+ }
+ return x;
}
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
@@ -4113,6 +4347,27 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(free_calls);
+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long ratio;
+ int err;
+
+ err = strict_strtoul(buf, 10, &ratio);
+ if (err)
+ return err;
+
+ if (ratio < 100)
+ s->defrag_ratio = ratio;
+ return length;
+}
+SLAB_ATTR(defrag_ratio);
+
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4192,6 +4447,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(SHRINK_CALLS, shrink_calls);
+STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag);
+STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab);
+STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped);
+STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed);
+STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed);
#endif
static struct attribute *slab_attrs[] = {
@@ -4205,7 +4466,7 @@ static struct attribute *slab_attrs[] = {
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
- &ctor_attr.attr,
+ &ops_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
&sanity_checks_attr.attr,
@@ -4220,6 +4481,7 @@ static struct attribute *slab_attrs[] = {
&shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+ &defrag_ratio_attr.attr,
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
@@ -4245,6 +4507,12 @@ static struct attribute *slab_attrs[] = {
&deactivate_to_tail_attr.attr,
&deactivate_remote_frees_attr.attr,
&order_fallback_attr.attr,
+ &shrink_calls_attr.attr,
+ &shrink_attempt_defrag_attr.attr,
+ &shrink_empty_slab_attr.attr,
+ &shrink_slab_skipped_attr.attr,
+ &shrink_slab_reclaimed_attr.attr,
+ &shrink_object_reclaim_failed_attr.attr,
#endif
NULL
};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ea1440b53db..6aa67117e6d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -151,6 +151,14 @@ void unregister_shrinker(struct shrinker *shrinker)
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
+
+/*
+ * Trigger a call into slab defrag if the sum of the returns from
+ * shrinkers cross this value.
+ */
+int slab_defrag_limit = 1000;
+int slab_defrag_counter;
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -168,10 +176,18 @@ EXPORT_SYMBOL(unregister_shrinker);
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure but the VFS infrastructure does not allow that at the present
+ * time.
+ *
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+ unsigned long lru_pages, struct zone *zone)
{
struct shrinker *shrinker;
unsigned long ret = 0;
@@ -228,6 +244,39 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+
+
+ /* Avoid dirtying cachelines */
+ if (!ret)
+ return 0;
+
+ /*
+ * "ret" doesnt really contain the freed object count. The shrinkers
+ * fake it. Gotta go with what we are getting though.
+ *
+ * Handling of the defrag_counter is also racy. If we get the
+ * wrong counts then we may unnecessarily do a defrag pass or defer
+ * one. "ret" is already faked. So this is just increasing
+ * the already existing fuzziness to get some notion as to when
+ * to initiate slab defrag which will hopefully be okay.
+ */
+ if (zone) {
+ /* balance_pgdat running on a zone so we only scan one node */
+ zone->slab_defrag_counter += ret;
+ if (zone->slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ zone->slab_defrag_counter = 0;
+ kmem_cache_defrag(zone_to_nid(zone));
+ }
+ } else {
+ /* Direct (and thus global) reclaim. Scan all nodes */
+ slab_defrag_counter += ret;
+ if (slab_defrag_counter > slab_defrag_limit &&
+ (gfp_mask & __GFP_FS)) {
+ slab_defrag_counter = 0;
+ kmem_cache_defrag(-1);
+ }
+ }
return ret;
}
@@ -1586,7 +1635,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scan_global_lru(sc)) {
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+ shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1820,7 +1869,7 @@ loop_again:
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ lru_pages, zone);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
@@ -2060,7 +2109,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
if (!reclaim_state.reclaimed_slab)
break;
@@ -2098,7 +2147,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- global_lru_pages());
+ global_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -2115,7 +2164,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
}
@@ -2277,7 +2326,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+ zone) &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
slab_reclaimable - nr_pages)
;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3ccfda23adc..c4b7280e6e70 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,11 +774,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
#endif
}
seq_printf(m,
+ "\n slab_defrag_count: %lu"
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
- zone_is_all_unreclaimable(zone),
+ zone->slab_defrag_counter,
+ zone_is_all_unreclaimable(zone),
zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);