Diffstat (limited to 'mm/filemap.c'):
 -rw-r--r--  mm/filemap.c | 308
 1 file changed, 215 insertions(+), 93 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 1aaea26556cc..1a6beaf69f49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping) ||
+ if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
@@ -827,15 +827,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
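The rewritten __add_to_page_cache_locked() above leans on the standard XArray store/retry pattern: take the xa_lock, attempt the store, and let xas_nomem() allocate memory outside the lock and retry on -ENOMEM (the patch also masks gfp with GFP_RECLAIM_MASK once, up front, instead of at every retry). A minimal sketch of just that idiom, with the conflict walk, entry splitting and accounting stripped out (store_cache_entry() is a hypothetical helper, not part of the patch):

/*
 * Minimal sketch of the xas_nomem() retry idiom, assuming a hypothetical
 * store_cache_entry() helper; no conflict walk, no split, no accounting.
 */
#include <linux/fs.h>
#include <linux/xarray.h>

static int store_cache_entry(struct address_space *mapping, pgoff_t index,
                             void *entry, gfp_t gfp)
{
        XA_STATE(xas, &mapping->i_pages, index);

        do {
                xas_lock_irq(&xas);
                xas_store(&xas, entry);         /* may record -ENOMEM in xas */
                xas_unlock_irq(&xas);
                /* allocates outside the lock and retries on -ENOMEM */
        } while (xas_nomem(&xas, gfp));

        return xas_error(&xas);
}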
@@ -988,9 +1008,43 @@ void __init pagecache_init(void)
page_writeback_init();
}
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ * (a) no special bits set:
+ *
+ * We're just waiting for the bit to be released, and when a waker
+ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ * and remove it from the wait queue.
+ *
+ * Simple and straightforward.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE:
+ *
+ * The waiter is waiting to get the lock, and only one waiter should
+ * be woken up to avoid any thundering herd behavior. We'll set the
+ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ * This is the traditional exclusive wait.
+ *
+ * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ * The waiter is waiting to get the bit, and additionally wants the
+ * lock to be transferred to it for fair lock behavior. If the lock
+ * cannot be taken, we stop walking the wait queue without waking
+ * the waiter.
+ *
+ * This is the "fair lock handoff" case, and in addition to setting
+ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ * that it now has the lock.
+ */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
- int ret;
+ unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
@@ -999,35 +1053,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return 0;
/*
- * If it's an exclusive wait, we get the bit for it, and
- * stop walking if we can't.
- *
- * If it's a non-exclusive wait, then the fact that this
- * wake function was called means that the bit already
- * was cleared, and we don't care if somebody then
- * re-took it.
+ * If it's a lock handoff wait, we get the bit for it, and
+ * stop walking (and do not wake it up) if we can't.
*/
- ret = 0;
- if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ flags = wait->flags;
+ if (flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_bit(key->bit_nr, &key->page->flags))
return -1;
- ret = 1;
+ if (flags & WQ_FLAG_CUSTOM) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ flags |= WQ_FLAG_DONE;
+ }
}
- wait->flags |= WQ_FLAG_WOKEN;
+ /*
+ * We are holding the wait-queue lock, but the waiter that
+ * is waiting for this will be checking the flags without
+ * any locking.
+ *
+ * So update the flags atomically, and wake up the waiter
+ * afterwards to avoid any races. This store-release pairs
+ * with the load-acquire in wait_on_page_bit_common().
+ */
+ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
/*
* Ok, we have successfully done what we're waiting for,
* and we can unconditionally remove the wait entry.
*
- * Note that this has to be the absolute last thing we do,
- * since after list_del_init(&wait->entry) the wait entry
+ * Note that this pairs with the "finish_wait()" in the
+ * waiter, and has to be the absolute last thing we do.
+ * After this list_del_init(&wait->entry) the wait entry
* might be de-allocated and the process might even have
* exited.
*/
list_del_init_careful(&wait->entry);
- return ret;
+ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
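The waker/waiter handshake described in the comment block above depends on plain release/acquire ordering: the waker publishes the final wait->flags with smp_store_release() before the wakeup, and the waiter reads them with smp_load_acquire() without holding the wait-queue lock. A simplified sketch of just that pairing (publish_wakeup() and saw_wakeup() are illustrative names, not kernel symbols):

/*
 * Sketch of the smp_store_release()/smp_load_acquire() pairing only.
 */
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

static void publish_wakeup(struct wait_queue_entry *wait, unsigned int flags)
{
        /* Waker, under the wait-queue lock: publish flags before waking. */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, TASK_NORMAL);
}

static bool saw_wakeup(struct wait_queue_entry *wait)
{
        /* Waiter, lockless: this acquire pairs with the release above. */
        return smp_load_acquire(&wait->flags) & WQ_FLAG_WOKEN;
}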
@@ -1107,8 +1170,8 @@ enum behavior {
};
/*
- * Attempt to check (or get) the page bit, and mark the
- * waiter woken if successful.
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
*/
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
struct wait_queue_entry *wait)
@@ -1119,13 +1182,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
} else if (test_bit(bit_nr, &page->flags))
return false;
- wait->flags |= WQ_FLAG_WOKEN;
+ wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
return true;
}
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
+ int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
@@ -1143,11 +1210,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
init_wait(wait);
- wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
+repeat:
+ wait->flags = 0;
+ if (behavior == EXCLUSIVE) {
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ if (--unfairness < 0)
+ wait->flags |= WQ_FLAG_CUSTOM;
+ }
+
/*
* Do one last check whether we can get the
* page bit synchronously.
@@ -1170,27 +1244,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
/*
* From now on, all the logic will be based on
- * the WQ_FLAG_WOKEN flag, and the and the page
- * bit testing (and setting) will be - or has
- * already been - done by the wake function.
+ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ * see whether the page bit testing has already
+ * been done by the wake function.
*
* We can drop our reference to the page.
*/
if (behavior == DROP)
put_page(page);
+ /*
+ * Note that until the "finish_wait()", or until
+ * we see the WQ_FLAG_WOKEN flag, we need to
+ * be very careful with the 'wait->flags', because
+ * we may race with a waker that sets them.
+ */
for (;;) {
+ unsigned int flags;
+
set_current_state(state);
- if (signal_pending_state(state, current))
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(state, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+
+ /* If we were non-exclusive, we're done */
+ if (behavior != EXCLUSIVE)
break;
- if (wait->flags & WQ_FLAG_WOKEN)
+ /* If the waker got the lock for us, we're done */
+ if (flags & WQ_FLAG_DONE)
break;
- io_schedule();
+ /*
+ * Otherwise, if we're getting the lock, we need to
+ * try to get it ourselves.
+ *
+ * And if that fails, we'll have to retry this all.
+ */
+ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ goto repeat;
+
+ wait->flags |= WQ_FLAG_DONE;
+ break;
}
+ /*
+ * If a signal happened, this 'finish_wait()' may remove the last
+ * waiter from the wait-queues, but the PageWaiters bit will remain
+ * set. That's ok. The next wakeup will take care of it, and trying
+ * to do it here would be difficult and prone to races.
+ */
finish_wait(q, wait);
if (thrashing) {
@@ -1200,12 +1310,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
/*
- * A signal could leave PageWaiters set. Clearing it here if
- * !waitqueue_active would be possible (by open-coding finish_wait),
- * but still fail to catch it in the case of wait hash collision. We
- * already can fail to clear wait hash collision cases, so don't
- * bother with signals either.
+ * NOTE! The wait->flags weren't stable until we've done the
+ * 'finish_wait()', and we could have exited the loop above due
+ * to a signal, and had a wakeup event happen after the signal
+ * test but before the 'finish_wait()'.
+ *
+ * So only after the finish_wait() can we reliably determine
+ * if we got woken up or not, so we can now figure out the final
+ * return value based on that state without races.
+ *
+ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ * waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
+ if (behavior == EXCLUSIVE)
+ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
@@ -1327,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1547,19 +1665,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
@@ -1587,7 +1705,6 @@ repeat:
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1595,40 +1712,37 @@ out:
}
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
/**
* pagecache_get_page - Find and get a reference to a page.
@@ -1643,6 +1757,8 @@ EXPORT_SYMBOL(find_lock_entry);
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1683,12 +1799,12 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1698,11 +1814,13 @@ repeat:
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;
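With FGP_HEAD, pagecache_get_page() hands back the THP head page, so the lock and the reference belong to the head, and the caller uses find_subpage() to reach the exact page for the index. A sketch of such a caller (lookup_subpage_locked() is a hypothetical helper, not introduced by this patch):

/*
 * Hypothetical caller of the new FGP_HEAD semantics: lock and refcount
 * live on the head page, find_subpage() locates the page for @index.
 */
#include <linux/mm.h>
#include <linux/pagemap.h>

static struct page *lookup_subpage_locked(struct address_space *mapping,
                                          pgoff_t index, struct page **headp)
{
        struct page *head;

        head = pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
        if (!head)
                return NULL;

        *headp = head;          /* caller must unlock_page() and put_page() this */
        return find_subpage(head, index);
}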
@@ -2267,7 +2385,11 @@ readpage:
}
if (!PageUptodate(page)) {
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
+
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
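For IOCB_WAITQ callers (io_uring async buffered reads), lock_page_async() either takes the page lock immediately or queues iocb->ki_waitq and fails, so the read returns early instead of sleeping and is retried from the wake callback. A hedged caller-side sketch of that choice (the -EIOCBQUEUED detail is my reading of the async path, not spelled out in this hunk):

/* Hypothetical wrapper around the locking choice made above. */
#include <linux/fs.h>
#include <linux/pagemap.h>

static int lock_for_read(struct kiocb *iocb, struct page *page)
{
        if (iocb->ki_flags & IOCB_WAITQ)
                /* queues iocb->ki_waitq; -EIOCBQUEUED if it would have to wait */
                return lock_page_async(page, iocb->ki_waitq);
        return lock_page_killable(page);        /* sleeps; -EINTR on fatal signal */
}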
@@ -2466,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2478,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2499,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
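do_sync_mmap_readahead() now builds a readahead_control up front with DEFINE_READAHEAD() (the four-argument form used at this point in the series) and submits the read-around window through do_page_cache_ra() instead of ra_submit(). A sketch of that submission, assuming a hypothetical helper and the mm-internal declaration of do_page_cache_ra():

/*
 * Hypothetical helper mirroring the read-around submission above.
 * do_page_cache_ra() is mm-internal, hence the "internal.h" include.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>
#include "internal.h"

static void submit_readaround(struct file *file, struct file_ra_state *ra,
                              pgoff_t pgoff)
{
        DEFINE_READAHEAD(ractl, file, file->f_mapping, pgoff);

        ractl._index = ra->start;       /* the window may begin before @pgoff */
        do_page_cache_ra(&ractl, ra->size, ra->async_size);
}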
@@ -2691,42 +2813,42 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
@@ -2738,12 +2860,12 @@ void filemap_map_pages(struct vm_fault *vmf,
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
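filemap_map_pages() now pins and locks the head page while mapping the precise subpage. The speculative-get-then-reload step it relies on looks roughly like this in isolation (try_pin_head() is a hypothetical helper; the real loop also trylocks the head and re-checks ->mapping):

/*
 * The speculative pin used above, in isolation, under RCU.
 */
#include <linux/pagemap.h>
#include <linux/xarray.h>

static struct page *try_pin_head(struct xa_state *xas, struct page *head)
{
        if (!page_cache_get_speculative(head))
                return NULL;            /* refcount already zero: being freed */

        /* The slot may have changed or been split while we took the ref. */
        if (unlikely(head != xas_reload(xas))) {
                put_page(head);
                return NULL;
        }
        return head;                    /* pinned, not locked */
}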
@@ -2882,7 +3004,7 @@ filler:
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)