summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c439
1 files changed, 304 insertions, 135 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a49702445ce0..ee83baaf855d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -130,20 +131,11 @@ static int page_cache_tree_insert(struct address_space *mapping,
return -EEXIST;
mapping->nrexceptional--;
- if (!dax_mapping(mapping)) {
- if (shadowp)
- *shadowp = p;
- } else {
- /* DAX can replace empty locked entry with a hole */
- WARN_ON_ONCE(p !=
- dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- /* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index, p,
- true);
- }
+ if (shadowp)
+ *shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
mapping->nrpages++;
return 0;
}
@@ -171,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
radix_tree_clear_tags(&mapping->page_tree, node, slot);
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
}
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
+
if (shadow) {
mapping->nrexceptional += nr;
/*
@@ -187,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
mapping->nrpages -= nr;
}
-/*
- * Delete a page from the page cache and free it. Caller has to make
- * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
- */
-void __delete_from_page_cache(struct page *page, void *shadow)
+static void unaccount_page_cache_page(struct address_space *mapping,
+ struct page *page)
{
- struct address_space *mapping = page->mapping;
- int nr = hpage_nr_pages(page);
+ int nr;
- trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
@@ -233,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
}
- page_cache_tree_delete(mapping, page, shadow);
-
- page->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
-
/* hugetlb pages do not participate in page cache accounting. */
if (PageHuge(page))
return;
+ nr = hpage_nr_pages(page);
+
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
@@ -252,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
/*
- * At this point page must be either written or cleaned by truncate.
- * Dirty page here signals a bug and loss of unwritten data.
+ * At this point page must be either written or cleaned by
+ * truncate. Dirty page here signals a bug and loss of
+ * unwritten data.
*
- * This fixes dirty accounting after removing the page entirely but
- * leaves PageDirty set: it has no effect for truncated page and
- * anyway will be cleared before returning page into buddy allocator.
+ * This fixes dirty accounting after removing the page entirely
+ * but leaves PageDirty set: it has no effect for truncated
+ * page and anyway will be cleared before returning page into
+ * buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe. The caller must hold the mapping's tree_lock.
+ */
+void __delete_from_page_cache(struct page *page, void *shadow)
+{
+ struct address_space *mapping = page->mapping;
+
+ trace_mm_filemap_delete_from_page_cache(page);
+
+ unaccount_page_cache_page(mapping, page);
+ page_cache_tree_delete(mapping, page, shadow);
+}
+
+static void page_cache_free_page(struct address_space *mapping,
+ struct page *page)
+{
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+ if (freepage)
+ freepage(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
+}
+
/**
* delete_from_page_cache - delete page from page cache
* @page: the page which the kernel is trying to remove from page cache
@@ -275,27 +295,98 @@ void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
-
- freepage = mapping->a_ops->freepage;
-
spin_lock_irqsave(&mapping->tree_lock, flags);
__delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (freepage)
- freepage(page);
+ page_cache_free_page(mapping, page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
+/*
+ * page_cache_tree_delete_batch - delete several pages from page cache
+ * @mapping: the mapping to which pages belong
+ * @pvec: pagevec with pages to delete
+ *
+ * The function walks over mapping->page_tree and removes pages passed in @pvec
+ * from the radix tree. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * modified). The function expects only THP head pages to be present in the
+ * @pvec and takes care to delete all corresponding tail pages from the radix
+ * tree as well.
+ *
+ * The function expects mapping->tree_lock to be held.
+ */
+static void
+page_cache_tree_delete_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ int total_pages = 0;
+ int i = 0, tail_pages = 0;
+ struct page *page;
+ pgoff_t start;
+
+ start = pvec->pages[0]->index;
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (i >= pagevec_count(pvec) && !tail_pages)
+ break;
+ page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (radix_tree_exceptional_entry(page))
+ continue;
+ if (!tail_pages) {
+ /*
+ * Some page got inserted in our range? Skip it. We
+ * have our pages locked so they are protected from
+ * being removed.
+ */
+ if (page != pvec->pages[i])
+ continue;
+ WARN_ON_ONCE(!PageLocked(page));
+ if (PageTransHuge(page) && !PageHuge(page))
+ tail_pages = HPAGE_PMD_NR - 1;
+ page->mapping = NULL;
+ /*
+ * Leave page->index set: truncation lookup relies
+ * upon it
+ */
+ i++;
+ } else {
+ tail_pages--;
+ }
+ radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
+ __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
+ workingset_lookup_update(mapping));
+ total_pages++;
}
+ mapping->nrpages -= total_pages;
+}
+
+void delete_from_page_cache_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ int i;
+ unsigned long flags;
+
+ if (!pagevec_count(pvec))
+ return;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+
+ unaccount_page_cache_page(mapping, pvec->pages[i]);
+ }
+ page_cache_tree_delete_batch(mapping, pvec);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ for (i = 0; i < pagevec_count(pvec); i++)
+ page_cache_free_page(mapping, pvec->pages[i]);
}
-EXPORT_SYMBOL(delete_from_page_cache);
int filemap_check_errors(struct address_space *mapping)
{
@@ -402,8 +493,7 @@ bool filemap_range_has_page(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- bool ret;
+ struct page *page;
if (end_byte < start_byte)
return false;
@@ -411,12 +501,10 @@ bool filemap_range_has_page(struct address_space *mapping,
if (mapping->nrpages == 0)
return false;
- pagevec_init(&pvec, 0);
- if (!pagevec_lookup(&pvec, mapping, index, 1))
+ if (!find_get_pages_range(mapping, &index, end, 1, &page))
return false;
- ret = (pvec.pages[0]->index <= end);
- pagevec_release(&pvec);
- return ret;
+ put_page(page);
+ return true;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -431,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
if (end_byte < start_byte)
return;
- pagevec_init(&pvec, 0);
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ pagevec_init(&pvec);
+ while (index <= end) {
unsigned i;
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
wait_on_page_writeback(page);
ClearPageError(page);
}
@@ -476,6 +562,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file: file pointing to address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them. Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
+/**
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
* @mapping: address space structure to wait for
*
@@ -489,45 +598,22 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- __filemap_fdatawait_range(mapping, 0, i_size - 1);
+ __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
-/**
- * filemap_fdatawait - wait for all under-writeback pages to complete
- * @mapping: address space structure to wait for
- *
- * Walk the list of under-writeback pages of the given address space
- * and wait for all of them. Check error status of the address space
- * and return it.
- *
- * Since the error status of the address space is cleared by this function,
- * callers are responsible for checking the return value and handling and/or
- * reporting the error.
- */
-int filemap_fdatawait(struct address_space *mapping)
+static bool mapping_needs_writeback(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- return filemap_fdatawait_range(mapping, 0, i_size - 1);
+ return (!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional);
}
-EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -566,8 +652,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -589,7 +674,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
- errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+ errseq_t eseq = errseq_set(&mapping->wb_err, err);
trace_filemap_set_wb_err(mapping, eseq);
}
@@ -633,6 +718,14 @@ int file_check_and_advance_wb_err(struct file *file)
trace_file_check_and_advance_wb_err(file, old);
spin_unlock(&file->f_lock);
}
+
+ /*
+ * We're mostly using this function as a drop in replacement for
+ * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+ * that the legacy code would have had on these flags.
+ */
+ clear_bit(AS_EIO, &mapping->flags);
+ clear_bit(AS_ENOSPC, &mapping->flags);
return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
@@ -656,8 +749,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -885,6 +977,7 @@ void __init pagecache_init(void)
page_writeback_init();
}
+/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
struct wait_page_key {
struct page *page;
int bit_nr;
@@ -909,8 +1002,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
if (wait_page->bit_nr != key->bit_nr)
return 0;
+
+ /* Stop walking if it's locked */
if (test_bit(key->bit_nr, &key->page->flags))
- return 0;
+ return -1;
return autoremove_wake_function(wait, mode, sync, key);
}
@@ -920,13 +1015,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
unsigned long flags;
+ wait_queue_entry_t bookmark;
key.page = page;
key.bit_nr = bit_nr;
key.page_match = 0;
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key(q, TASK_NORMAL, &key);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ /*
+ * Take a breather from holding the lock,
+ * allow pages that finish wake up asynchronously
+ * to acquire the lock and remove themselves
+ * from wait queue
+ */
+ spin_unlock_irqrestore(&q->lock, flags);
+ cpu_relax();
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ }
+
/*
* It is possible for other pages to have collided on the waitqueue
* hash, so in that case check for a page match. That prevents a long-
@@ -964,6 +1079,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
int ret = 0;
init_wait(wait);
+ wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
@@ -972,10 +1088,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
spin_lock_irq(&q->lock);
if (likely(list_empty(&wait->entry))) {
- if (lock)
- __add_wait_queue_entry_tail_exclusive(q, wait);
- else
- __add_wait_queue(q, wait);
+ __add_wait_queue_entry_tail(q, wait);
SetPageWaiters(page);
}
@@ -985,10 +1098,6 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (likely(test_bit(bit_nr, &page->flags))) {
io_schedule();
- if (unlikely(signal_pending_state(state, current))) {
- ret = -EINTR;
- break;
- }
}
if (lock) {
@@ -998,6 +1107,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (!test_bit(bit_nr, &page->flags))
break;
}
+
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
}
finish_wait(q, wait);
@@ -1025,6 +1139,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
wait_queue_head_t *q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
+EXPORT_SYMBOL(wait_on_page_bit_killable);
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -1039,7 +1154,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, waiter);
+ __add_wait_queue_entry_tail(q, waiter);
SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -1564,23 +1679,29 @@ export:
}
/**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping. The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
*
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
*/
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1590,8 +1711,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1627,11 +1751,25 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *start = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
rcu_read_unlock();
+
return ret;
}
@@ -1715,9 +1853,10 @@ repeat:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_tag - find and return pages that match @tag
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
* @mapping: the address_space to search
* @index: the starting page index
+ * @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1725,8 +1864,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1739,6 +1879,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1780,18 +1923,28 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *index = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when we got at @end. We take care to not overflow the
+ * index @index as it confuses some of the callers. This breaks the
+ * iteration when there is page at index -1 but that is already broken
+ * anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
rcu_read_unlock();
- if (ret)
- *index = pages[ret - 1]->index + 1;
-
return ret;
}
-EXPORT_SYMBOL(find_get_pages_tag);
+EXPORT_SYMBOL(find_get_pages_range_tag);
/**
* find_get_entries_tag - find and return entries that match @tag
@@ -1886,9 +2039,8 @@ static void shrink_readahead_size_eio(struct file *filp,
}
/**
- * do_generic_file_read - generic file read routine
- * @filp: the file to read
- * @ppos: current file position
+ * generic_file_buffered_read - generic file read routine
+ * @iocb: the iocb to read
* @iter: data destination
* @written: already copied
*
@@ -1898,12 +2050,14 @@ static void shrink_readahead_size_eio(struct file *filp,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
+ struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
+ loff_t *ppos = &iocb->ki_pos;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
@@ -1936,6 +2090,8 @@ find_page:
page = find_get_page(mapping, index);
if (!page) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
@@ -1949,6 +2105,11 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessarily
@@ -2112,7 +2273,7 @@ no_cached_page:
* Ok, it wasn't cached, so we need to create a new
* page..
*/
- page = page_cache_alloc_cold(mapping);
+ page = page_cache_alloc(mapping);
if (!page) {
error = -ENOMEM;
goto out;
@@ -2130,6 +2291,8 @@ no_cached_page:
goto readpage;
}
+would_block:
+ error = -EAGAIN;
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_SHIFT;
@@ -2151,14 +2314,14 @@ out:
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- ssize_t retval = 0;
size_t count = iov_iter_count(iter);
+ ssize_t retval = 0;
if (!count)
goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
+ struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
@@ -2199,7 +2362,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
- retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
+ retval = generic_file_buffered_read(iocb, iter, retval);
out:
return retval;
}
@@ -2222,7 +2385,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
int ret;
do {
- page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return -ENOMEM;
@@ -2626,7 +2789,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = __page_cache_alloc(gfp | __GFP_COLD);
+ page = __page_cache_alloc(gfp);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
@@ -2885,9 +3048,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
+ *
+ * Most of the time we do not need this since dio_complete() will do
+ * the invalidation for us. However there are some file systems that
+ * do not end up with dio_complete() being called, so let's not break
+ * them by removing it completely
*/
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
+ if (mapping->nrpages)
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
if (written > 0) {
pos += written;