summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c762
1 files changed, 426 insertions, 336 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index d1458ecf2f51..5e206a429b57 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
@@ -76,8 +75,9 @@
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
- * ->i_mutex
- * ->i_mmap_rwsem (truncate->unmap_mapping_range)
+ * ->i_rwsem
+ * ->invalidate_lock (acquired by fs in truncate path)
+ * ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
@@ -85,9 +85,10 @@
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
- * ->lock_page (access_process_vm)
+ * ->invalidate_lock (filemap_fault)
+ * ->lock_page (filemap_fault, access_process_vm)
*
- * ->i_mutex (generic_perform_write)
+ * ->i_rwsem (generic_perform_write)
* ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
@@ -258,12 +259,11 @@ static void page_cache_free_page(struct address_space *mapping,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long flags;
BUG_ON(!PageLocked(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
page_cache_free_page(mapping, page);
}
@@ -335,19 +335,18 @@ void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec)
{
int i;
- unsigned long flags;
if (!pagevec_count(pvec))
return;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_delete_batch(mapping, pvec);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -378,6 +377,32 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
}
/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping: address space structure to write
+ * @wbc: the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ int ret;
+
+ if (!mapping_can_writeback(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ wbc_attach_fdatawrite_inode(wbc, mapping->host);
+ ret = do_writepages(mapping, wbc);
+ wbc_detach_inode(wbc);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
+/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
@@ -397,7 +422,6 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
- int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
@@ -405,14 +429,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_can_writeback(mapping) ||
- !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- wbc_attach_fdatawrite_inode(&wbc, mapping->host);
- ret = do_writepages(mapping, &wbc);
- wbc_detach_inode(&wbc);
- return ret;
+ return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
@@ -817,11 +834,12 @@ EXPORT_SYMBOL(file_write_and_wait_range);
*/
void replace_page_cache_page(struct page *old, struct page *new)
{
+ struct folio *fold = page_folio(old);
+ struct folio *fnew = page_folio(new);
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
@@ -831,9 +849,9 @@ void replace_page_cache_page(struct page *old, struct page *new)
new->mapping = mapping;
new->index = offset;
- mem_cgroup_migrate(old, new);
+ mem_cgroup_migrate(fold, fnew);
- xas_lock_irqsave(&xas, flags);
+ xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
@@ -846,33 +864,32 @@ void replace_page_cache_page(struct page *old, struct page *new)
__dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
- xas_unlock_irqrestore(&xas, flags);
+ xas_unlock_irq(&xas);
if (freepage)
freepage(old);
put_page(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-noinline int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp,
- void **shadowp)
+noinline int __filemap_add_folio(struct address_space *mapping,
+ struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, offset);
- int huge = PageHuge(page);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int huge = folio_test_hugetlb(folio);
int error;
bool charged = false;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- get_page(page);
- page->mapping = mapping;
- page->index = offset;
+ folio_get(folio);
+ folio->mapping = mapping;
+ folio->index = index;
if (!huge) {
- error = mem_cgroup_charge(page, NULL, gfp);
+ error = mem_cgroup_charge(folio, NULL, gfp);
+ VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
goto error;
charged = true;
@@ -884,7 +901,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
void *entry, *old = NULL;
- if (order > thp_order(page))
+ if (order > folio_order(folio))
xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
order, gfp);
xas_lock_irq(&xas);
@@ -901,13 +918,13 @@ noinline int __add_to_page_cache_locked(struct page *page,
*shadowp = old;
/* entry may have been split before we acquired lock */
order = xa_get_order(xas.xa, xas.xa_index);
- if (order > thp_order(page)) {
+ if (order > folio_order(folio)) {
xas_split(&xas, old, order);
xas_reset(&xas);
}
}
- xas_store(&xas, page);
+ xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
@@ -915,7 +932,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
- __inc_lruvec_page_state(page, NR_FILE_PAGES);
+ __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -923,19 +940,19 @@ unlock:
if (xas_error(&xas)) {
error = xas_error(&xas);
if (charged)
- mem_cgroup_uncharge(page);
+ mem_cgroup_uncharge(folio);
goto error;
}
- trace_mm_filemap_add_to_page_cache(page);
+ trace_mm_filemap_add_to_page_cache(&folio->page);
return 0;
error:
- page->mapping = NULL;
+ folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- put_page(page);
+ folio_put(folio);
return error;
}
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -952,62 +969,99 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- return __add_to_page_cache_locked(page, mapping, offset,
+ return __filemap_add_folio(mapping, page_folio(page), offset,
gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+ pgoff_t index, gfp_t gfp)
{
void *shadow = NULL;
int ret;
- __SetPageLocked(page);
- ret = __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, &shadow);
+ __folio_set_locked(folio);
+ ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
if (unlikely(ret))
- __ClearPageLocked(page);
+ __folio_clear_locked(folio);
else {
/*
- * The page might have been evicted from cache only
+ * The folio might have been evicted from cache only
* recently, in which case it should be activated like
- * any other repeatedly accessed page.
- * The exception is pages getting rewritten; evicting other
+ * any other repeatedly accessed folio.
+ * The exception is folios getting rewritten; evicting other
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- WARN_ON_ONCE(PageActive(page));
- if (!(gfp_mask & __GFP_WRITE) && shadow)
- workingset_refault(page, shadow);
- lru_cache_add(page);
+ WARN_ON_ONCE(folio_test_active(folio));
+ if (!(gfp & __GFP_WRITE) && shadow)
+ workingset_refault(folio, shadow);
+ folio_add_lru(folio);
}
return ret;
}
-EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
{
int n;
- struct page *page;
+ struct folio *folio;
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = __alloc_pages_node(n, gfp, 0);
- } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ folio = __folio_alloc_node(gfp, order, n);
+ } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
- return page;
+ return folio;
}
- return alloc_pages(gfp, 0);
+ return folio_alloc(gfp, order);
}
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(filemap_alloc_folio);
#endif
/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ *
+ * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1 > mapping2)
+ swap(mapping1, mapping2);
+ if (mapping1)
+ down_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ *
+ * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
+ *
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ struct address_space *mapping2)
+{
+ if (mapping1)
+ up_write(&mapping1->invalidate_lock);
+ if (mapping2 && mapping1 != mapping2)
+ up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
+/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
* waitqueues where the bucket discipline is to maintain all
@@ -1019,11 +1073,11 @@ EXPORT_SYMBOL(__page_cache_alloc);
*/
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
- return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+ return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
void __init pagecache_init(void)
@@ -1031,7 +1085,7 @@ void __init pagecache_init(void)
int i;
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(&page_wait_table[i]);
+ init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
}
@@ -1086,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->page->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1102,7 +1156,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*
* So update the flags atomically, and wake up the waiter
* afterwards to avoid any races. This store-release pairs
- * with the load-acquire in wait_on_page_bit_common().
+ * with the load-acquire in folio_wait_bit_common().
*/
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
@@ -1121,14 +1175,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void folio_wake_bit(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
wait_queue_entry_t bookmark;
- key.page = page;
+ key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;
@@ -1163,7 +1217,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
* page waiters.
*/
if (!waitqueue_active(q) || !key.page_match) {
- ClearPageWaiters(page);
+ folio_clear_waiters(folio);
/*
* It's possible to miss clearing Waiters here, when we woke
* our page waiters, but the hashed waitqueue has waiters for
@@ -1175,19 +1229,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
spin_unlock_irqrestore(&q->lock, flags);
}
-static void wake_up_page(struct page *page, int bit)
+static void folio_wake(struct folio *folio, int bit)
{
- if (!PageWaiters(page))
+ if (!folio_test_waiters(folio))
return;
- wake_up_page_bit(page, bit);
+ folio_wake_bit(folio, bit);
}
/*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for folio_wait_bit_common():
*/
enum behavior {
EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
- * __lock_page() waiting on then setting PG_locked.
+ * __folio_lock() waiting on then setting PG_locked.
*/
SHARED, /* Hold ref to page and check the bit when woken, like
* wait_on_page_writeback() waiting on PG_writeback.
@@ -1198,16 +1252,16 @@ enum behavior {
};
/*
- * Attempt to check (or get) the page bit, and mark us done
+ * Attempt to check (or get) the folio flag, and mark us done
* if successful.
*/
-static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &page->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags))
return false;
- } else if (test_bit(bit_nr, &page->flags))
+ } else if (test_bit(bit_nr, &folio->flags))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1217,9 +1271,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
+ int state, enum behavior behavior)
{
+ wait_queue_head_t *q = folio_waitqueue(folio);
int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
@@ -1228,8 +1283,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
unsigned long pflags;
if (bit_nr == PG_locked &&
- !PageUptodate(page) && PageWorkingset(page)) {
- if (!PageSwapBacked(page)) {
+ !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ if (!folio_test_swapbacked(folio)) {
delayacct_thrashing_start();
delayacct = true;
}
@@ -1239,7 +1294,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
init_wait(wait);
wait->func = wake_page_function;
- wait_page.page = page;
+ wait_page.folio = folio;
wait_page.bit_nr = bit_nr;
repeat:
@@ -1254,7 +1309,7 @@ repeat:
* Do one last check whether we can get the
* page bit synchronously.
*
- * Do the SetPageWaiters() marking before that
+ * Do the folio_set_waiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
@@ -1265,8 +1320,8 @@ repeat:
* lock to avoid races.
*/
spin_lock_irq(&q->lock);
- SetPageWaiters(page);
- if (!trylock_page_bit_common(page, bit_nr, wait))
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, bit_nr, wait))
__add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);
@@ -1276,10 +1331,10 @@ repeat:
* see whether the page bit testing has already
* been done by the wake function.
*
- * We can drop our reference to the page.
+ * We can drop our reference to the folio.
*/
if (behavior == DROP)
- put_page(page);
+ folio_put(folio);
/*
* Note that until the "finish_wait()", or until
@@ -1316,7 +1371,7 @@ repeat:
*
* And if that fails, we'll have to retry this all.
*/
- if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
goto repeat;
wait->flags |= WQ_FLAG_DONE;
@@ -1325,7 +1380,7 @@ repeat:
/*
* If a signal happened, this 'finish_wait()' may remove the last
- * waiter from the wait-queues, but the PageWaiters bit will remain
+ * waiter from the wait-queues, but the folio waiters bit will remain
* set. That's ok. The next wakeup will take care of it, and trying
* to do it here would be difficult and prone to races.
*/
@@ -1356,19 +1411,17 @@ repeat:
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
-void wait_on_page_bit(struct page *page, int bit_nr)
+void folio_wait_bit(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
+ folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
-EXPORT_SYMBOL(wait_on_page_bit);
+EXPORT_SYMBOL(folio_wait_bit);
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
+ return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
-EXPORT_SYMBOL(wait_on_page_bit_killable);
+EXPORT_SYMBOL(folio_wait_bit_killable);
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
@@ -1385,31 +1438,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable);
*/
int put_and_wait_on_page_locked(struct page *page, int state)
{
- wait_queue_head_t *q;
-
- page = compound_head(page);
- q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
+ return folio_wait_bit_common(page_folio(page), PG_locked, state,
+ DROP);
}
/**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
+ * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
+ * @folio: Folio defining the wait queue of interest
* @waiter: Waiter to add to the queue
*
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ * Add an arbitrary @waiter to the wait queue for the nominated @folio.
*/
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_entry_tail(q, waiter);
- SetPageWaiters(page);
+ folio_set_waiters(folio);
spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
+EXPORT_SYMBOL_GPL(folio_add_wait_queue);
#ifndef clear_bit_unlock_is_negative_byte
@@ -1435,124 +1485,116 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
#endif
/**
- * unlock_page - unlock a locked page
- * @page: the page
+ * folio_unlock - Unlock a locked folio.
+ * @folio: The folio.
*
- * Unlocks the page and wakes up sleepers in wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
*
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context. May not be
+ * called from NMI context.
*/
-void unlock_page(struct page *page)
+void folio_unlock(struct folio *folio)
{
+ /* Bit 7 allows x86 to check the byte's sign bit */
BUILD_BUG_ON(PG_waiters != 7);
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
- wake_up_page_bit(page, PG_locked);
+ BUILD_BUG_ON(PG_locked > 7);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+ folio_wake_bit(folio, PG_locked);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(folio_unlock);
/**
- * end_page_private_2 - Clear PG_private_2 and release any waiters
- * @page: The page
+ * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
+ * @folio: The folio.
*
- * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
- * this. The page ref held for PG_private_2 being set is released.
+ * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
+ * it. The folio reference held for PG_private_2 being set is released.
*
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
+ * This is, for example, used when a netfs folio is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same folio to be
* serialised.
*/
-void end_page_private_2(struct page *page)
+void folio_end_private_2(struct folio *folio)
{
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PagePrivate2(page), page);
- clear_bit_unlock(PG_private_2, &page->flags);
- wake_up_page_bit(page, PG_private_2);
- put_page(page);
+ VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
+ clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+ folio_wake_bit(folio, PG_private_2);
+ folio_put(folio);
}
-EXPORT_SYMBOL(end_page_private_2);
+EXPORT_SYMBOL(folio_end_private_2);
/**
- * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
*/
-void wait_on_page_private_2(struct page *page)
+void folio_wait_private_2(struct folio *folio)
{
- page = compound_head(page);
- while (PagePrivate2(page))
- wait_on_page_bit(page, PG_private_2);
+ while (folio_test_private_2(folio))
+ folio_wait_bit(folio, PG_private_2);
}
-EXPORT_SYMBOL(wait_on_page_private_2);
+EXPORT_SYMBOL(folio_wait_private_2);
/**
- * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
* fatal signal is received by the calling task.
*
* Return:
* - 0 if successful.
* - -EINTR if a fatal signal was encountered.
*/
-int wait_on_page_private_2_killable(struct page *page)
+int folio_wait_private_2_killable(struct folio *folio)
{
int ret = 0;
- page = compound_head(page);
- while (PagePrivate2(page)) {
- ret = wait_on_page_bit_killable(page, PG_private_2);
+ while (folio_test_private_2(folio)) {
+ ret = folio_wait_bit_killable(folio, PG_private_2);
if (ret < 0)
break;
}
return ret;
}
-EXPORT_SYMBOL(wait_on_page_private_2_killable);
+EXPORT_SYMBOL(folio_wait_private_2_killable);
/**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
*/
-void end_page_writeback(struct page *page)
+void folio_end_writeback(struct folio *folio)
{
/*
- * TestClearPageReclaim could be used here but it is an atomic
- * operation and overkill in this particular case. Failing to
- * shuffle a page marked for immediate reclaim is too mild to
- * justify taking an atomic operation penalty at the end of
- * ever page writeback.
+ * folio_test_clear_reclaim() could be used here but it is an
+ * atomic operation and overkill in this particular case. Failing
+ * to shuffle a folio marked for immediate reclaim is too mild
+ * a gain to justify taking an atomic operation penalty at the
+ * end of every folio writeback.
*/
- if (PageReclaim(page)) {
- ClearPageReclaim(page);
- rotate_reclaimable_page(page);
+ if (folio_test_reclaim(folio)) {
+ folio_clear_reclaim(folio);
+ folio_rotate_reclaimable(folio);
}
/*
- * Writeback does not hold a page reference of its own, relying
+ * Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
- * But here we must make sure that the page is not freed and
- * reused before the wake_up_page().
+ * But here we must make sure that the folio is not freed and
+ * reused before the folio_wake().
*/
- get_page(page);
- if (!test_clear_page_writeback(page))
+ folio_get(folio);
+ if (!__folio_end_writeback(folio))
BUG();
smp_mb__after_atomic();
- wake_up_page(page, PG_writeback);
- put_page(page);
+ folio_wake(folio, PG_writeback);
+ folio_put(folio);
}
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(folio_end_writeback);
/*
* After completing I/O on a page, call this routine to update the page
@@ -1583,39 +1625,35 @@ void page_endio(struct page *page, bool is_write, int err)
EXPORT_SYMBOL_GPL(page_endio);
/**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
*/
-void __lock_page(struct page *__page)
+void __folio_lock(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__folio_lock);
-int __lock_page_killable(struct page *__page)
+int __folio_lock_killable(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__folio_lock_killable);
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
- struct wait_queue_head *q = page_waitqueue(page);
+ struct wait_queue_head *q = folio_waitqueue(folio);
int ret = 0;
- wait->page = page;
+ wait->folio = folio;
wait->bit_nr = PG_locked;
spin_lock_irq(&q->lock);
__add_wait_queue_entry_tail(q, &wait->wait);
- SetPageWaiters(page);
- ret = !trylock_page(page);
+ folio_set_waiters(folio);
+ ret = !folio_trylock(folio);
/*
* If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
@@ -1632,16 +1670,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait)
/*
* Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * true - folio is locked; mmap_lock is still held.
+ * false - folio is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
* which case mmap_lock is still held.
*
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * with the folio locked and the mmap_lock unperturbed.
*/
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
@@ -1650,28 +1688,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
- return 0;
+ return false;
mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
- wait_on_page_locked_killable(page);
+ folio_wait_locked_killable(folio);
else
- wait_on_page_locked(page);
- return 0;
+ folio_wait_locked(folio);
+ return false;
}
if (flags & FAULT_FLAG_KILLABLE) {
- int ret;
+ bool ret;
- ret = __lock_page_killable(page);
+ ret = __folio_lock_killable(folio);
if (ret) {
mmap_read_unlock(mm);
- return 0;
+ return false;
}
} else {
- __lock_page(page);
+ __folio_lock(folio);
}
- return 1;
+ return true;
}
/**
@@ -1747,143 +1785,155 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
EXPORT_SYMBOL(page_cache_prev_miss);
/*
+ * Lockless page cache protocol:
+ * On the lookup side:
+ * 1. Load the folio from i_pages
+ * 2. Increment the refcount if it's not zero
+ * 3. If the folio is not found by xas_reload(), put the refcount and retry
+ *
+ * On the removal side:
+ * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
+ * B. Remove the page from i_pages
+ * C. Return the page to the page allocator
+ *
+ * This means that any page may have its reference count temporarily
+ * increased by a speculative page cache (or fast GUP) lookup as it can
+ * be allocated by another user before the RCU grace period expires.
+ * Because the refcount temporarily acquired here may end up being the
+ * last refcount on the page, any page allocation must be freeable by
+ * folio_put().
+ */
+
+/*
* mapping_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @index. If there is a
- * page cache page, the head page is returned with an increased refcount.
+ * Looks up the page cache entry at @mapping & @index. If it is a folio,
+ * it is returned with an increased refcount. If it is a shadow entry
+ * of a previously evicted folio, or a swap entry from shmem/tmpfs,
+ * it is returned without further action.
*
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Return: The head page or shadow entry, %NULL if nothing is found.
+ * Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
-static struct page *mapping_get_entry(struct address_space *mapping,
- pgoff_t index)
+static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
- struct page *page;
+ struct folio *folio;
rcu_read_lock();
repeat:
xas_reset(&xas);
- page = xas_load(&xas);
- if (xas_retry(&xas, page))
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
goto repeat;
/*
* A shadow entry of a recently evicted page, or a swap entry from
* shmem/tmpfs. Return it without attempting to raise page count.
*/
- if (!page || xa_is_value(page))
+ if (!folio || xa_is_value(folio))
goto out;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto repeat;
- /*
- * Has the page moved or been split?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != xas_reload(&xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
goto repeat;
}
out:
rcu_read_unlock();
- return page;
+ return folio;
}
/**
- * pagecache_get_page - Find and get a reference to a page.
+ * __filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
- * @fgp_flags: %FGP flags modify how the page is returned.
- * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* @fgp_flags can be zero or more of these flags:
*
- * * %FGP_ACCESSED - The page will be marked accessed.
- * * %FGP_LOCK - The page is returned locked.
- * * %FGP_HEAD - If the page is present and a THP, return the head page
- * rather than the exact page specified by the index.
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
* * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new page to replace it.
+ * instead of allocating a new folio to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU list.
+ * @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
- * * %FGP_NOWAIT - Don't get blocked by page lock
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
*
- * Return: The found page or %NULL otherwise.
+ * Return: The found folio or %NULL otherwise.
*/
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp_mask)
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp)
{
- struct page *page;
+ struct folio *folio;
repeat:
- page = mapping_get_entry(mapping, index);
- if (xa_is_value(page)) {
+ folio = mapping_get_entry(mapping, index);
+ if (xa_is_value(folio)) {
if (fgp_flags & FGP_ENTRY)
- return page;
- page = NULL;
+ return folio;
+ folio = NULL;
}
- if (!page)
+ if (!folio)
goto no_page;
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return NULL;
}
} else {
- lock_page(page);
+ folio_lock(folio);
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
- VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
}
if (fgp_flags & FGP_ACCESSED)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else if (fgp_flags & FGP_WRITE) {
/* Clear idle flag for buffer write */
- if (page_is_idle(page))
- clear_page_idle(page);
+ if (folio_test_idle(folio))
+ folio_clear_idle(folio);
}
- if (!(fgp_flags & FGP_HEAD))
- page = find_subpage(page, index);
+ if (fgp_flags & FGP_STABLE)
+ folio_wait_stable(folio);
no_page:
- if (!page && (fgp_flags & FGP_CREAT)) {
+ if (!folio && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
- gfp_mask |= __GFP_WRITE;
+ gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
- gfp_mask &= ~__GFP_FS;
+ gfp &= ~__GFP_FS;
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp, 0);
+ if (!folio)
return NULL;
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
@@ -1891,27 +1941,27 @@ no_page:
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
- put_page(page);
- page = NULL;
+ folio_put(folio);
+ folio = NULL;
if (err == -EEXIST)
goto repeat;
}
/*
- * add_to_page_cache_lru locks the page, and for mmap we expect
- * an unlocked page.
+ * filemap_add_folio locks the page, and for mmap
+ * we expect an unlocked page.
*/
- if (page && (fgp_flags & FGP_FOR_MMAP))
- unlock_page(page);
+ if (folio && (fgp_flags & FGP_FOR_MMAP))
+ folio_unlock(folio);
}
- return page;
+ return folio;
}
-EXPORT_SYMBOL(pagecache_get_page);
+EXPORT_SYMBOL(__filemap_get_folio);
static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
@@ -2366,41 +2416,50 @@ static int filemap_update_page(struct kiocb *iocb,
struct address_space *mapping, struct iov_iter *iter,
struct page *page)
{
+ struct folio *folio = page_folio(page);
int error;
- if (!trylock_page(page)) {
- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!filemap_invalidate_trylock_shared(mapping))
return -EAGAIN;
+ } else {
+ filemap_invalidate_lock_shared(mapping);
+ }
+
+ if (!folio_trylock(folio)) {
+ error = -EAGAIN;
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
- put_and_wait_on_page_locked(page, TASK_KILLABLE);
+ filemap_invalidate_unlock_shared(mapping);
+ put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
- error = __lock_page_async(page, iocb->ki_waitq);
+ error = __folio_lock_async(folio, iocb->ki_waitq);
if (error)
- return error;
+ goto unlock_mapping;
}
- if (!page->mapping)
- goto truncated;
+ error = AOP_TRUNCATED_PAGE;
+ if (!folio->mapping)
+ goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
goto unlock;
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
goto unlock;
- error = filemap_read_page(iocb->ki_filp, mapping, page);
- if (error == AOP_TRUNCATED_PAGE)
- put_page(page);
- return error;
-truncated:
- unlock_page(page);
- put_page(page);
- return AOP_TRUNCATED_PAGE;
+ error = filemap_read_page(iocb->ki_filp, mapping, &folio->page);
+ goto unlock_mapping;
unlock:
- unlock_page(page);
+ folio_unlock(folio);
+unlock_mapping:
+ filemap_invalidate_unlock_shared(mapping);
+ if (error == AOP_TRUNCATED_PAGE)
+ folio_put(folio);
return error;
}
@@ -2415,6 +2474,19 @@ static int filemap_create_page(struct file *file,
if (!page)
return -ENOMEM;
+ /*
+ * Protect against truncate / hole punch. Grabbing invalidate_lock here
+ * assures we cannot instantiate and bring uptodate new pagecache pages
+ * after evicting page cache during truncate and before actually
+ * freeing blocks. Note that we could release invalidate_lock after
+ * inserting the page into page cache as the locked page would then be
+ * enough to synchronize with hole punching. But there are code paths
+ * such as filemap_update_page() filling in partially uptodate pages or
+ * ->readpages() that need to hold invalidate_lock while mapping blocks
+ * for IO so let's hold the lock here as well to keep locking rules
+ * simple.
+ */
+ filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2426,9 +2498,11 @@ static int filemap_create_page(struct file *file,
if (error)
goto error;
+ filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
+ filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
@@ -2822,7 +2896,9 @@ unlock:
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
struct file **fpin)
{
- if (trylock_page(page))
+ struct folio *folio = page_folio(page);
+
+ if (folio_trylock(folio))
return 1;
/*
@@ -2835,7 +2911,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
if (vmf->flags & FAULT_FLAG_KILLABLE) {
- if (__lock_page_killable(page)) {
+ if (__folio_lock_killable(folio)) {
/*
* We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
@@ -2847,11 +2923,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
return 0;
}
} else
- __lock_page(page);
+ __folio_lock(folio);
+
return 1;
}
-
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap sem, so if we have
@@ -2967,6 +3043,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
pgoff_t max_off;
struct page *page;
vm_fault_t ret = 0;
+ bool mapping_locked = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
@@ -2976,25 +3053,39 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+ if (likely(page)) {
/*
- * We found the page, so try async readahead before
- * waiting for the lock.
+ * We found the page, so try async readahead before waiting for
+ * the lock.
*/
- fpin = do_async_mmap_readahead(vmf, page);
- } else if (!page) {
+ if (!(vmf->flags & FAULT_FLAG_TRIED))
+ fpin = do_async_mmap_readahead(vmf, page);
+ if (unlikely(!PageUptodate(page))) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
+ } else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
+ /*
+ * See comment in filemap_create_page() why we need
+ * invalidate_lock
+ */
+ if (!mapping_locked) {
+ filemap_invalidate_lock_shared(mapping);
+ mapping_locked = true;
+ }
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
@@ -3014,8 +3105,20 @@ retry_find:
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page)))
+ if (unlikely(!PageUptodate(page))) {
+ /*
+ * The page was in cache and uptodate and now it is not.
+ * Strange but possible since we didn't hold the page lock all
+ * the time. Let's drop everything get the invalidate lock and
+ * try again.
+ */
+ if (!mapping_locked) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
goto page_not_uptodate;
+ }
/*
* We've made it this far and we had to drop our mmap_lock, now is the
@@ -3026,6 +3129,8 @@ retry_find:
unlock_page(page);
goto out_retry;
}
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
@@ -3056,6 +3161,7 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
+ filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
@@ -3067,6 +3173,8 @@ out_retry:
*/
if (page)
put_page(page);
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
@@ -3437,6 +3545,8 @@ out:
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
@@ -3460,6 +3570,8 @@ EXPORT_SYMBOL(read_cache_page);
*
* If the page does not get brought uptodate, return -EIO.
*
+ * The function expects mapping->invalidate_lock to be already held.
+ *
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
@@ -3594,28 +3706,6 @@ out:
}
EXPORT_SYMBOL(generic_file_direct_write);
-/*
- * Find or create a page at the given pagecache position. Return the locked
- * page. This function is specifically for buffered writes.
- */
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index, unsigned flags)
-{
- struct page *page;
- int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
-
- if (flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
-
- page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (page)
- wait_for_stable_page(page);
-
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
-
ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
@@ -3704,12 +3794,12 @@ EXPORT_SYMBOL(generic_perform_write);
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
- * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
- * avoid syncing under i_mutex.
+ * avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
@@ -3797,7 +3887,7 @@ EXPORT_SYMBOL(__generic_file_write_iter);
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
- * and acquires i_mutex as needed.
+ * and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write