From 83137a5649ec8a0bb769c68024b0532733087482 Mon Sep 17 00:00:00 2001
From: Dan Magenheimer
Date: Fri, 8 Oct 2010 13:45:18 -0600
Subject: mm: cleancache core ops functions and config

This third patch of eight in this cleancache series provides the core
code for cleancache that interfaces between the hooks in VFS and
individual filesystems and a cleancache backend.  It also includes
build and config patches.

Two new files are added: mm/cleancache.c and include/linux/cleancache.h.

Note that CONFIG_CLEANCACHE defaults to on; in systems that do not
provide a cleancache backend, all hooks devolve to a simple check of
a global enable flag, so the performance impact should be negligible;
it can be reduced to zero if configured off.

Details and a FAQ can be found in Documentation/vm/cleancache.txt

Credits: cleancache_ops design derived from Jeremy Fitzhardinge's
design for tmem

[v7: dan.magenheimer@oracle.com: cleanup sysfs and remove cleancache prefix]
[v6: JBeulich@novell.com: robustly handle buggy fs encode_fh actor definition]
[v5: jeremy@goop.org: clean up global usage and static var names]
[v5: jeremy@goop.org: simplify init hook and any future fs init changes]
[v5: hch@infradead.org: cleaner non-global interface for ops registration]
[v4: adilger@sun.com: interface must support exportfs FS's]
[v4: hch@infradead.org: interface must support 64-bit FS on 32-bit kernel]
[v3: akpm@linux-foundation.org: use one ops struct to avoid pointer hops]
[v3: akpm@linux-foundation.org: document and ensure PageLocked requirements are met]
[v3: ngupta@vflare.org: fix success/fail codes, change funcs to void]
[v2: viro@ZenIV.linux.org.uk: use sane types]
Signed-off-by: Dan Magenheimer
Reviewed-by: Jeremy Fitzhardinge
Reviewed-by: Konrad Rzeszutek Wilk
Acked-by: Al Viro
Acked-by: Andrew Morton
Acked-by: Nitin Gupta
Acked-by: Minchan Kim
Acked-by: Andreas Dilger
Acked-by: Jan Beulich
Cc: Matthew Wilcox
Cc: Nick Piggin
Cc: Mel Gorman
Cc: Rik Van Riel
Cc: Chris Mason
Cc: Ted Ts'o
Cc: Mark Fasheh
Cc: Joel Becker
---
 mm/Kconfig      |  22 +++++
 mm/Makefile     |   1 +
 mm/cleancache.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 268 insertions(+)
 create mode 100644 mm/cleancache.c

diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..9ee07517451a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -301,3 +301,25 @@ config NOMMU_INITIAL_TRIM_EXCESS
 	  of 1 says that all excess pages should be trimmed.
 
 	  See Documentation/nommu-mmap.txt for more information.
+
+config CLEANCACHE
+	bool "Enable cleancache pseudo-RAM driver to cache clean pages"
+	default y
+	help
+	  Cleancache can be thought of as a page-granularity victim cache
+	  for clean pages that the kernel's pageframe replacement algorithm
+	  (PFRA) would like to keep around, but can't since there isn't enough
+	  memory.  So when the PFRA "evicts" a page, it first attempts to put
+	  it into a synchronous concurrency-safe page-oriented pseudo-RAM
+	  device (such as Xen's Transcendent Memory, aka "tmem") which is not
+	  directly accessible or addressable by the kernel and is of unknown
+	  (and possibly time-varying) size.  And when a cleancache-enabled
+	  filesystem wishes to access a page in a file on disk, it first
+	  checks cleancache to see if it already contains it; if it does,
+	  the page is copied into the kernel and a disk access is avoided.
+	  When a pseudo-RAM device is available, a significant I/O reduction
+	  may be achieved.  When none is available, all cleancache calls
+	  are reduced to a single pointer-compare-against-NULL resulting
+	  in a negligible performance hit.
+
+	  If unsure, say Y to enable cleancache

diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..10d9419d78e7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -47,3 +47,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..f545eb8f1118
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,245 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache.  See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/cleancache.h>
+
+/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/flush even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so it is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops cleancache_ops;
+
+/* useful stats available in /sys/kernel/mm/cleancache */
+static unsigned long cleancache_succ_gets;
+static unsigned long cleancache_failed_gets;
+static unsigned long cleancache_puts;
+static unsigned long cleancache_flushes;
+
+/*
+ * Register operations for cleancache, returning the previous ops to
+ * allow detection of multiple backends and possible nesting.
+ */
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+{
+	struct cleancache_ops old = cleancache_ops;
+
+	cleancache_ops = *ops;
+	cleancache_enabled = 1;
+	return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+	sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+	sb->cleancache_poolid =
+		(*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+			      struct cleancache_filekey *key)
+{
+	int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+	int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+	struct super_block *sb = inode->i_sb;
+	struct dentry *d;
+
+	key->u.ino = inode->i_ino;
+	if (sb->s_export_op != NULL) {
+		fhfn = sb->s_export_op->encode_fh;
+		if (fhfn) {
+			d = list_first_entry(&inode->i_dentry,
+					     struct dentry, d_alias);
+			len = (*fhfn)(d, &key->u.fh[0], &maxlen, 0);
+			if (len <= 0 || len == 255)
+				return -1;
+			if (maxlen > CLEANCACHE_KEY_MAX)
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleancache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * If the get fails, the pageframe is left unchanged and -1 is returned.
+ * Page must be locked by caller.
+ */
+int __cleancache_get_page(struct page *page)
+{
+	int ret = -1;
+	int pool_id;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	VM_BUG_ON(!PageLocked(page));
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id < 0)
+		goto out;
+
+	if (cleancache_get_key(page->mapping->host, &key) < 0)
+		goto out;
+
+	ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+	if (ret == 0)
+		cleancache_succ_gets++;
+	else
+		cleancache_failed_gets++;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's inode
+ * and page index.  Page must be locked.  Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ */
+void __cleancache_put_page(struct page *page)
+{
+	int pool_id;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	VM_BUG_ON(!PageLocked(page));
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id >= 0 &&
+	    cleancache_get_key(page->mapping->host, &key) >= 0) {
+		(*cleancache_ops.put_page)(pool_id, key, page->index, page);
+		cleancache_puts++;
+	}
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Flush any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ */
+void __cleancache_flush_page(struct address_space *mapping, struct page *page)
+{
+	/* careful... page->mapping is NULL sometimes when this is called */
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	if (pool_id >= 0) {
+		VM_BUG_ON(!PageLocked(page));
+		if (cleancache_get_key(mapping->host, &key) >= 0) {
+			(*cleancache_ops.flush_page)(pool_id, key, page->index);
+			cleancache_flushes++;
+		}
+	}
+}
+EXPORT_SYMBOL(__cleancache_flush_page);
+
+/*
+ * Flush all data from cleancache associated with the poolid and the
+ * mapping's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ */
+void __cleancache_flush_inode(struct address_space *mapping)
+{
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
+	struct cleancache_filekey key = { .u.key = { 0 } };
+
+	if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+		(*cleancache_ops.flush_inode)(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_flush_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be returned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs.
+ */
+void __cleancache_flush_fs(struct super_block *sb)
+{
+	if (sb->cleancache_poolid >= 0) {
+		int old_poolid = sb->cleancache_poolid;
+		sb->cleancache_poolid = -1;
+		(*cleancache_ops.flush_fs)(old_poolid);
+	}
+}
+EXPORT_SYMBOL(__cleancache_flush_fs);
+
+#ifdef CONFIG_SYSFS
+
+/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
+
+#define CLEANCACHE_SYSFS_RO(_name) \
+	static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%lu\n", cleancache_##_name); \
+	} \
+	static struct kobj_attribute cleancache_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = cleancache_##_name##_show, \
+	}
+
+CLEANCACHE_SYSFS_RO(succ_gets);
+CLEANCACHE_SYSFS_RO(failed_gets);
+CLEANCACHE_SYSFS_RO(puts);
+CLEANCACHE_SYSFS_RO(flushes);
+
+static struct attribute *cleancache_attrs[] = {
+	&cleancache_succ_gets_attr.attr,
+	&cleancache_failed_gets_attr.attr,
+	&cleancache_puts_attr.attr,
+	&cleancache_flushes_attr.attr,
+	NULL,
+};
+
+static struct attribute_group cleancache_attr_group = {
+	.attrs = cleancache_attrs,
+	.name = "cleancache",
+};
+
+#endif /* CONFIG_SYSFS */
+
+static int __init init_cleancache(void)
+{
+#ifdef CONFIG_SYSFS
+	int err;
+
+	err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
+#endif /* CONFIG_SYSFS */
+	return 0;
+}
+module_init(init_cleancache)
-- 
cgit v1.2.3

From 03e838947c8abe29a9d7abfaf7fd9125a46b70e9 Mon Sep 17 00:00:00 2001
From: Dan Magenheimer
Date: Fri, 8 Oct 2010 13:45:18 -0600
Subject: mm/fs: add hooks to support cleancache

This fourth patch of eight in this cleancache series provides the core
hooks in VFS for: initializing cleancache per filesystem; capturing
clean pages reclaimed by page cache; attempting to get pages from
cleancache before filesystem read; and ensuring coherency between
pagecache, disk, and cleancache.  Note that the placement of these
hooks has been stable since 2.6.18.

All hooks become no-ops if CONFIG_CLEANCACHE is unset, or become a
check of a boolean global if CONFIG_CLEANCACHE is set but no cleancache
"backend" has claimed cleancache_ops.
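To make that no-op claim concrete, the per-hook wrappers added to
include/linux/cleancache.h earlier in this series have roughly the
following shape (a sketch, not the verbatim header text):

	static inline int cleancache_get_page(struct page *page)
	{
		int ret = -1;

		/* when CONFIG_CLEANCACHE is unset the header defines
		 * cleancache_enabled to 0 and the compiler discards
		 * the call; otherwise this is the boolean-global check
		 */
		if (cleancache_enabled)
			ret = __cleancache_get_page(page);
		return ret;
	}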
Details and a FAQ can be found in Documentation/vm/cleancache.txt

Signed-off-by: Chris Mason
Signed-off-by: Dan Magenheimer
Reviewed-by: Jeremy Fitzhardinge
Reviewed-by: Konrad Rzeszutek Wilk
Cc: Andrew Morton
Cc: Al Viro
Cc: Matthew Wilcox
Cc: Nick Piggin
Cc: Mel Gorman
Cc: Rik Van Riel
Cc: Jan Beulich
Cc: Andreas Dilger
Cc: Ted Ts'o
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Nitin Gupta
---
 fs/buffer.c   |  5 +++++
 fs/mpage.c    |  7 +++++++
 fs/super.c    |  3 +++
 mm/filemap.c  | 11 +++++++++++
 mm/truncate.c | 10 ++++++++++
 5 files changed, 36 insertions(+)

diff --git a/fs/buffer.c b/fs/buffer.c
index 3e7dca279d1c..f6e4008f288a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -277,6 +278,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, let's be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..47baec3dc944 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/cleancache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -286,6 +287,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		SetPageMappedToDisk(page);
 	}
 
+	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+	    cleancache_get_page(page) == 0) {
+		SetPageUptodate(page);
+		goto confused;
+	}
+
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
diff --git a/fs/super.c b/fs/super.c
index 8819e3a7ff20..30a54e0d81a6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -30,6 +30,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -110,6 +111,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+		s->cleancache_poolid = -1;
 	}
 out:
 	return s;
@@ -176,6 +178,7 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		fs->kill_sb(s);
+		cleancache_flush_fs(s);
 		put_filesystem(fs);
 		put_super(s);
 	} else {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..b234c6dfd47c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 /*
@@ -119,6 +120,16 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	/*
+	 * if we're uptodate, flush out into the cleancache, otherwise
+	 * invalidate any existing cleancache entries.  We can't leave
+	 * stale data around in the cleancache once our page is gone
+	 */
+	if (PageUptodate(page))
+		cleancache_put_page(page);
+	else
+		cleancache_flush_page(mapping, page);
+
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..cd94607b6762 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/cleancache.h>
 #include "internal.h"
 
 
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+	cleancache_flush_page(page->mapping, page);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }
@@ -108,6 +110,10 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	clear_page_mlock(page);
 	remove_from_page_cache(page);
 	ClearPageMappedToDisk(page);
+	/* this must be after the remove_from_page_cache which
+	 * calls cleancache_put_page (and note page->mapping is now NULL)
+	 */
+	cleancache_flush_page(mapping, page);
 	page_cache_release(page);	/* pagecache ref */
 	return 0;
 }
@@ -215,6 +221,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t next;
 	int i;
 
+	cleancache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -290,6 +297,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		pagevec_release(&pvec);
 		mem_cgroup_uncharge_end();
 	}
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -428,6 +436,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
+	cleancache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !wrapped &&
@@ -486,6 +495,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
+	cleancache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
-- 
cgit v1.2.3
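
For readers wanting to see the other side of this interface, below is a
minimal, hypothetical no-op backend sketched against the cleancache_ops
signatures used by the frontend above.  All ex_* names are invented for
illustration; a real backend (e.g. Xen's tmem driver) would actually
store and retrieve page contents:

	#include <linux/module.h>
	#include <linux/cleancache.h>

	static int ex_init_fs(size_t pagesize)
	{
		return 0;	/* hand pool id 0 to the (single) filesystem */
	}

	static int ex_init_shared_fs(char *uuid, size_t pagesize)
	{
		return 0;
	}

	static int ex_get_page(int pool_id, struct cleancache_filekey key,
			       pgoff_t index, struct page *page)
	{
		return -1;	/* nothing is ever stored, so every get misses */
	}

	static void ex_put_page(int pool_id, struct cleancache_filekey key,
				pgoff_t index, struct page *page)
	{
		/* a real backend would copy the locked page into pseudo-RAM here */
	}

	static void ex_flush_page(int pool_id, struct cleancache_filekey key,
				  pgoff_t index)
	{
	}

	static void ex_flush_inode(int pool_id, struct cleancache_filekey key)
	{
	}

	static void ex_flush_fs(int pool_id)
	{
	}

	static struct cleancache_ops ex_ops = {
		.init_fs = ex_init_fs,
		.init_shared_fs = ex_init_shared_fs,
		.get_page = ex_get_page,
		.put_page = ex_put_page,
		.flush_page = ex_flush_page,
		.flush_inode = ex_flush_inode,
		.flush_fs = ex_flush_fs,
	};

	static int __init ex_init(void)
	{
		struct cleancache_ops old = cleancache_register_ops(&ex_ops);

		/* a non-NULL old.get_page means another backend had already
		 * claimed cleancache_ops; a real backend should decide whether
		 * nesting or replacement is appropriate
		 */
		if (old.get_page != NULL)
			printk(KERN_WARNING "ex: replaced existing cleancache backend\n");
		return 0;
	}
	module_init(ex_init);
	MODULE_LICENSE("GPL");

Because the ops struct is copied by value at registration time, the
returned 'old' struct is the only record of a previously registered
backend; this is what the "detection of multiple backends and possible
nesting" comment in mm/cleancache.c refers to.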