summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c160
-rw-r--r--mm/huge_memory.c42
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memory_hotplug.c14
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mlock.c24
-rw-r--r--mm/page_isolation.c8
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/shmem.c10
-rw-r--r--mm/swap.c6
-rw-r--r--mm/vmscan.c10
11 files changed, 213 insertions, 70 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 1aaea26556cc..5202e38ab79e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -988,9 +988,43 @@ void __init pagecache_init(void)
page_writeback_init();
}
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ * (a) no special bits set:
+ *
+ * We're just waiting for the bit to be released, and when a waker
+ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ * and remove it from the wait queue.
+ *
+ * Simple and straightforward.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE:
+ *
+ * The waiter is waiting to get the lock, and only one waiter should
+ * be woken up to avoid any thundering herd behavior. We'll set the
+ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ * This is the traditional exclusive wait.
+ *
+ * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ * The waiter is waiting to get the bit, and additionally wants the
+ * lock to be transferred to it for fair lock behavior. If the lock
+ * cannot be taken, we stop walking the wait queue without waking
+ * the waiter.
+ *
+ * This is the "fair lock handoff" case, and in addition to setting
+ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ * that it now has the lock.
+ */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
- int ret;
+ unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
@@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return 0;
/*
- * If it's an exclusive wait, we get the bit for it, and
- * stop walking if we can't.
- *
- * If it's a non-exclusive wait, then the fact that this
- * wake function was called means that the bit already
- * was cleared, and we don't care if somebody then
- * re-took it.
+ * If it's a lock handoff wait, we get the bit for it, and
+ * stop walking (and do not wake it up) if we can't.
*/
- ret = 0;
- if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ flags = wait->flags;
+ if (flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_bit(key->bit_nr, &key->page->flags))
return -1;
- ret = 1;
+ if (flags & WQ_FLAG_CUSTOM) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ flags |= WQ_FLAG_DONE;
+ }
}
- wait->flags |= WQ_FLAG_WOKEN;
+ /*
+ * We are holding the wait-queue lock, but the waiter that
+ * is waiting for this will be checking the flags without
+ * any locking.
+ *
+ * So update the flags atomically, and wake up the waiter
+ * afterwards to avoid any races. This store-release pairs
+ * with the load-acquire in wait_on_page_bit_common().
+ */
+ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
/*
* Ok, we have successfully done what we're waiting for,
* and we can unconditionally remove the wait entry.
*
- * Note that this has to be the absolute last thing we do,
- * since after list_del_init(&wait->entry) the wait entry
+ * Note that this pairs with the "finish_wait()" in the
+ * waiter, and has to be the absolute last thing we do.
+ * After this list_del_init(&wait->entry) the wait entry
* might be de-allocated and the process might even have
* exited.
*/
list_del_init_careful(&wait->entry);
- return ret;
+ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1107,8 +1150,8 @@ enum behavior {
};
/*
- * Attempt to check (or get) the page bit, and mark the
- * waiter woken if successful.
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
*/
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
struct wait_queue_entry *wait)
@@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
} else if (test_bit(bit_nr, &page->flags))
return false;
- wait->flags |= WQ_FLAG_WOKEN;
+ wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
return true;
}
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
+ int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
@@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
init_wait(wait);
- wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
+repeat:
+ wait->flags = 0;
+ if (behavior == EXCLUSIVE) {
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ if (--unfairness < 0)
+ wait->flags |= WQ_FLAG_CUSTOM;
+ }
+
/*
* Do one last check whether we can get the
* page bit synchronously.
@@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
/*
* From now on, all the logic will be based on
- * the WQ_FLAG_WOKEN flag, and the and the page
- * bit testing (and setting) will be - or has
- * already been - done by the wake function.
+ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ * see whether the page bit testing has already
+ * been done by the wake function.
*
* We can drop our reference to the page.
*/
if (behavior == DROP)
put_page(page);
+ /*
+ * Note that until the "finish_wait()", or until
+ * we see the WQ_FLAG_WOKEN flag, we need to
+ * be very careful with the 'wait->flags', because
+ * we may race with a waker that sets them.
+ */
for (;;) {
+ unsigned int flags;
+
set_current_state(state);
- if (signal_pending_state(state, current))
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(state, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+
+ /* If we were non-exclusive, we're done */
+ if (behavior != EXCLUSIVE)
break;
- if (wait->flags & WQ_FLAG_WOKEN)
+ /* If the waker got the lock for us, we're done */
+ if (flags & WQ_FLAG_DONE)
break;
- io_schedule();
+ /*
+ * Otherwise, if we're getting the lock, we need to
+ * try to get it ourselves.
+ *
+ * And if that fails, we'll have to retry this all.
+ */
+ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ goto repeat;
+
+ wait->flags |= WQ_FLAG_DONE;
+ break;
}
+ /*
+ * If a signal happened, this 'finish_wait()' may remove the last
+ * waiter from the wait-queues, but the PageWaiters bit will remain
+ * set. That's ok. The next wakeup will take care of it, and trying
+ * to do it here would be difficult and prone to races.
+ */
finish_wait(q, wait);
if (thrashing) {
@@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
/*
- * A signal could leave PageWaiters set. Clearing it here if
- * !waitqueue_active would be possible (by open-coding finish_wait),
- * but still fail to catch it in the case of wait hash collision. We
- * already can fail to clear wait hash collision cases, so don't
- * bother with signals either.
+ * NOTE! The wait->flags weren't stable until we've done the
+ * 'finish_wait()', and we could have exited the loop above due
+ * to a signal, and had a wakeup event happen after the signal
+ * test but before the 'finish_wait()'.
+ *
+ * So only after the finish_wait() can we reliably determine
+ * if we got woken up or not, so we can now figure out the final
+ * return value based on that state without races.
+ *
+ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ * waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
+ if (behavior == EXCLUSIVE)
+ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7ff29cc3d55c..faadc449cca5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2022,7 +2022,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (is_huge_zero_pmd(*pmd)) {
+ } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2116,30 +2116,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- atomic_inc(&page[i]._mapcount);
- pte_unmap(pte);
- }
-
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ if (!pmd_migration)
atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
}
- lock_page_memcg(page);
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __dec_lruvec_page_state(page, NR_ANON_THPS);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
+ if (!pmd_migration) {
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 &&
+ !TestSetPageDoubleMap(page)) {
for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ lock_page_memcg(page);
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
}
+ unlock_page_memcg(page);
}
- unlock_page_memcg(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
diff --git a/mm/ksm.c b/mm/ksm.c
index 235f55d01541..9afccc36dbd2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2586,6 +2586,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ new_page = NULL;
+ }
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e9d5ab5d3ca0..b11a269e2356 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1575,6 +1575,20 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* check again */
ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
NULL, check_pages_isolated_cb);
+ /*
+ * per-cpu pages are drained in start_isolate_page_range, but if
+ * there are still pages that are not free, make sure that we
+ * drain again, because when we isolated range we might
+ * have raced with another thread that was adding pages to pcp
+ * list.
+ *
+ * Forward progress should be still guaranteed because
+ * pages on the pcp list can only belong to MOVABLE_ZONE
+ * because has_unmovable_pages explicitly checks for
+ * PageBuddy on freed pages on other zones.
+ */
+ if (ret)
+ drain_all_pages(zone);
} while (ret);
/* Ok, all of our target is isolated.
diff --git a/mm/migrate.c b/mm/migrate.c
index 941b89383cf3..aecb1433cf3c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -668,7 +668,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
copy_page_owner(page, newpage);
- mem_cgroup_migrate(page, newpage);
+ if (!PageHuge(page))
+ mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);
diff --git a/mm/mlock.c b/mm/mlock.c
index 93ca2bf30b4f..884b1216da6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -58,11 +58,14 @@ EXPORT_SYMBOL(can_do_mlock);
*/
void clear_page_mlock(struct page *page)
{
+ int nr_pages;
+
if (!TestClearPageMlocked(page))
return;
- mod_zone_page_state(page_zone(page), NR_MLOCK, -thp_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGCLEARED);
+ nr_pages = thp_nr_pages(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
/*
* The previous TestClearPageMlocked() corresponds to the smp_mb()
* in __pagevec_lru_add_fn().
@@ -76,7 +79,7 @@ void clear_page_mlock(struct page *page)
* We lost the race. the page already moved to evictable list.
*/
if (PageUnevictable(page))
- count_vm_event(UNEVICTABLE_PGSTRANDED);
+ count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
}
}
@@ -93,9 +96,10 @@ void mlock_vma_page(struct page *page)
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
if (!TestSetPageMlocked(page)) {
- mod_zone_page_state(page_zone(page), NR_MLOCK,
- thp_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
+ int nr_pages = thp_nr_pages(page);
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
if (!isolate_lru_page(page))
putback_lru_page(page);
}
@@ -138,7 +142,7 @@ static void __munlock_isolated_page(struct page *page)
/* Did try_to_unlock() succeed or punt? */
if (!PageMlocked(page))
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
putback_lru_page(page);
}
@@ -154,10 +158,12 @@ static void __munlock_isolated_page(struct page *page)
*/
static void __munlock_isolation_failed(struct page *page)
{
+ int nr_pages = thp_nr_pages(page);
+
if (PageUnevictable(page))
- __count_vm_event(UNEVICTABLE_PGSTRANDED);
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
else
- __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
}
/**
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 242c03121d73..63a3db10a8c0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -170,6 +170,14 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* pageblocks we may have modified and return -EBUSY to caller. This
* prevents two threads from simultaneously working on overlapping ranges.
*
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * In some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs drain_all_pages might be needed
+ * (e.g. __offline_pages will need to call it after check for isolated range for
+ * a next retry).
+ *
* Return: the number of isolated pageblocks on success and -EBUSY if any part
* of range cannot be isolated.
*/
diff --git a/mm/percpu.c b/mm/percpu.c
index f4709629e6de..1ed1a349eab8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1316,7 +1316,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
/* allocate chunk */
alloc_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT);
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk)
panic("%s: Failed to allocate %zu bytes\n", __func__,
diff --git a/mm/shmem.c b/mm/shmem.c
index 271548ca20f3..8e2b35ba93ad 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -279,11 +279,13 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
if (!(sb->s_flags & SB_KERNMOUNT)) {
spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
+ if (sbinfo->max_inodes) {
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
}
- sbinfo->free_inodes--;
if (inop) {
ino = sbinfo->next_ino++;
if (unlikely(is_zero_ino(ino)))
diff --git a/mm/swap.c b/mm/swap.c
index d16d65d9b4e0..e7bdf094f76a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -494,14 +494,14 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
/*
* We use the irq-unsafe __mod_zone_page_stat because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
- __mod_zone_page_state(page_zone(page), NR_MLOCK,
- thp_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
lru_cache_add(page);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9727dd8e2581..466fc3144fff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4268,8 +4268,14 @@ void check_move_unevictable_pages(struct pagevec *pvec)
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
+ int nr_pages;
+
+ if (PageTransTail(page))
+ continue;
+
+ nr_pages = thp_nr_pages(page);
+ pgscanned += nr_pages;
- pgscanned++;
if (pagepgdat != pgdat) {
if (pgdat)
spin_unlock_irq(&pgdat->lru_lock);
@@ -4288,7 +4294,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
- pgrescued++;
+ pgrescued += nr_pages;
}
}