summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Kconfig.debug6
-rw-r--r--mm/Makefile9
-rw-r--r--mm/backing-dev.c87
-rw-r--r--mm/bootmem.c2
-rw-r--r--mm/cma.c55
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c105
-rw-r--r--mm/debug.c4
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/fadvise.c15
-rw-r--r--mm/filemap.c348
-rw-r--r--mm/gup.c101
-rw-r--r--mm/huge_memory.c774
-rw-r--r--mm/hugetlb.c291
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/internal.h31
-rw-r--r--mm/kasan/kasan.c22
-rw-r--r--mm/kasan/kasan.h5
-rw-r--r--mm/kasan/kasan_init.c57
-rw-r--r--mm/kasan/quarantine.c144
-rw-r--r--mm/kasan/report.c41
-rw-r--r--mm/khugepaged.c50
-rw-r--r--mm/kmemleak.c8
-rw-r--r--mm/ksm.c113
-rw-r--r--mm/madvise.c102
-rw-r--r--mm/memblock.c121
-rw-r--r--mm/memcontrol.c90
-rw-r--r--mm/memory-failure.c33
-rw-r--r--mm/memory.c1379
-rw-r--r--mm/memory_hotplug.c121
-rw-r--r--mm/mempolicy.c57
-rw-r--r--mm/migrate.c145
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mlock.c11
-rw-r--r--mm/mmap.c94
-rw-r--r--mm/mmu_context.c4
-rw-r--r--mm/mmu_notifier.c3
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mprotect.c95
-rw-r--r--mm/mremap.c43
-rw-r--r--mm/nommu.c39
-rw-r--r--mm/oom_kill.c38
-rw-r--r--mm/page-writeback.c38
-rw-r--r--mm/page_alloc.c864
-rw-r--r--mm/page_idle.c34
-rw-r--r--mm/page_io.c5
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_vma_mapped.c223
-rw-r--r--mm/pagewalk.c52
-rw-r--r--mm/percpu-vm.c7
-rw-r--r--mm/percpu.c26
-rw-r--r--mm/pgtable-generic.c20
-rw-r--r--mm/process_vm_access.c13
-rw-r--r--mm/readahead.c39
-rw-r--r--mm/rmap.c657
-rw-r--r--mm/rodata_test.c56
-rw-r--r--mm/shmem.c223
-rw-r--r--mm/slab.c155
-rw-r--r--mm/slab.h53
-rw-r--r--mm/slab_common.c288
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c133
-rw-r--r--mm/sparse-vmemmap.c22
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c44
-rw-r--r--mm/swap_cgroup.c2
-rw-r--r--mm/swap_slots.c340
-rw-r--r--mm/swap_state.c80
-rw-r--r--mm/swapfile.c589
-rw-r--r--mm/truncate.c99
-rw-r--r--mm/usercopy.c7
-rw-r--r--mm/userfaultfd.c301
-rw-r--r--mm/util.c9
-rw-r--r--mm/vmacache.c13
-rw-r--r--mm/vmalloc.c295
-rw-r--r--mm/vmpressure.c10
-rw-r--r--mm/vmscan.c360
-rw-r--r--mm/vmstat.c114
-rw-r--r--mm/workingset.c118
-rw-r--r--mm/z3fold.c395
-rw-r--r--mm/zsmalloc.c86
-rw-r--r--mm/zswap.c309
83 files changed, 7182 insertions, 3591 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 86e3e0e74d20..9b8fccb969dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
- depends on X86_64
+ depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
depends on NUMA
default n
help
@@ -447,13 +447,9 @@ choice
benefit.
endchoice
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
- depends on TRANSPARENT_HUGEPAGE && !PPC
+ depends on TRANSPARENT_HUGEPAGE
#
# UP and nommu archs use km based percpu allocator
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afcc550877ff..79d0fd13b5b3 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,3 +90,9 @@ config DEBUG_PAGE_REF
careful when enabling this feature because it adds about 30 KB to the
kernel code. However the runtime performance overhead is virtually
nil until the tracepoints are actually enabled.
+
+config DEBUG_RODATA_TEST
+ bool "Testcase for the marking rodata read-only"
+ depends on STRICT_KERNEL_RWX
+ ---help---
+ This option enables a testcase for the setting rodata read-only.
diff --git a/mm/Makefile b/mm/Makefile
index 295bd7a9f76b..026f6a828a50 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
- mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o pgtable-generic.o
+ mlock.o mmap.o mprotect.o mremap.o msync.o \
+ page_vma_mapped.o pagewalk.o pgtable-generic.o \
+ rmap.o vmalloc.o
+
ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o
@@ -35,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o vmacache.o swap_slots.o \
interval_tree.o list_lru.o workingset.o \
debug.o $(mmu-y)
@@ -83,6 +85,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443f36d7..c6f2a37028c2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -237,6 +237,7 @@ static __init int bdi_class_init(void)
bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
+
return 0;
}
postcore_initcall(bdi_class_init);
@@ -310,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
if (!wb->congested)
@@ -409,8 +411,8 @@ retry:
while (*node != NULL) {
parent = *node;
- congested = container_of(parent, struct bdi_writeback_congested,
- rb_node);
+ congested = rb_entry(parent, struct bdi_writeback_congested,
+ rb_node);
if (congested->blkcg_id < blkcg_id)
node = &parent->rb_left;
else if (congested->blkcg_id > blkcg_id)
@@ -681,33 +683,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
- struct rb_node *rbn;
void **slot;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
-
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
-
- while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
- struct bdi_writeback_congested *congested =
- rb_entry(rbn, struct bdi_writeback_congested, rb_node);
-
- rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
- }
-
spin_unlock_irq(&cgwb_lock);
/*
- * All cgwb's and their congested states must be shutdown and
- * released before returning. Drain the usage counter to wait for
- * all cgwb's and cgwb_congested's ever created on @bdi.
+ * All cgwb's must be shutdown and released before returning. Drain
+ * the usage counter to wait for all cgwb's ever created on @bdi.
*/
atomic_dec(&bdi->usage_cnt);
wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+ /*
+ * Grab back our reference so that we hold it when @bdi gets
+ * re-registered.
+ */
+ atomic_inc(&bdi->usage_cnt);
}
/**
@@ -747,6 +742,21 @@ void wb_blkcg_offline(struct blkcg *blkcg)
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
+{
+ struct rb_node *rbn;
+
+ spin_lock_irq(&cgwb_lock);
+ while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
+ struct bdi_writeback_congested *congested =
+ rb_entry(rbn, struct bdi_writeback_congested, rb_node);
+
+ rb_erase(rbn, &bdi->cgwb_congested_tree);
+ congested->bdi = NULL; /* mark @congested unlinked */
+ }
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -757,9 +767,11 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;
+ atomic_set(&bdi->wb_congested->refcnt, 1);
+
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
- kfree(bdi->wb_congested);
+ wb_congested_put(bdi->wb_congested);
return err;
}
return 0;
@@ -767,6 +779,11 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
+{
+ wb_congested_put(bdi->wb_congested);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
int bdi_init(struct backing_dev_info *bdi)
@@ -775,6 +792,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->dev = NULL;
+ kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
@@ -790,6 +808,22 @@ int bdi_init(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_init);
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = kmalloc_node(sizeof(struct backing_dev_info),
+ gfp_mask | __GFP_ZERO, node_id);
+ if (!bdi)
+ return NULL;
+
+ if (bdi_init(bdi)) {
+ kfree(bdi);
+ return NULL;
+ }
+ return bdi;
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -833,6 +867,8 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
MINOR(owner->devt));
if (rc)
return rc;
+ /* Leaking owner reference... */
+ WARN_ON(bdi->owner);
bdi->owner = owner;
get_device(owner);
return 0;
@@ -870,10 +906,25 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-void bdi_exit(struct backing_dev_info *bdi)
+static void bdi_exit(struct backing_dev_info *bdi)
{
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
+}
+
+static void release_bdi(struct kref *ref)
+{
+ struct backing_dev_info *bdi =
+ container_of(ref, struct backing_dev_info, refcnt);
+
+ bdi_exit(bdi);
+ kfree(bdi);
+}
+
+void bdi_put(struct backing_dev_info *bdi)
+{
+ kref_put(&bdi->refcnt, release_bdi);
}
void bdi_destroy(struct backing_dev_info *bdi)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8a55a3c9feb..9fedb27c6451 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
static unsigned long __init bootmap_bytes(unsigned long pages)
{
- unsigned long bytes = DIV_ROUND_UP(pages, 8);
+ unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE);
return ALIGN(bytes, sizeof(long));
}
diff --git a/mm/cma.c b/mm/cma.c
index c960459eda7e..a6033e344430 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -235,18 +235,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t highmem_start;
int ret = 0;
-#ifdef CONFIG_X86
/*
- * high_memory isn't direct mapped memory so retrieving its physical
- * address isn't appropriate. But it would be useful to check the
- * physical address of the highmem boundary so it's justifiable to get
- * the physical address from it. On x86 there is a validation check for
- * this case, so the following workaround is needed to avoid it.
+ * We can't use __pa(high_memory) directly, since high_memory
+ * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly)
+ * complain. Find the boundary by adding one to the last valid
+ * address.
*/
- highmem_start = __pa_nodebug(high_memory);
-#else
- highmem_start = __pa(high_memory);
-#endif
+ highmem_start = __pa(high_memory - 1) + 1;
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
@@ -353,6 +348,32 @@ err:
return ret;
}
+#ifdef CONFIG_CMA_DEBUG
+static void cma_debug_show_areas(struct cma *cma)
+{
+ unsigned long next_zero_bit, next_set_bit;
+ unsigned long start = 0;
+ unsigned int nr_zero, nr_total = 0;
+
+ mutex_lock(&cma->lock);
+ pr_info("number of available pages: ");
+ for (;;) {
+ next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
+ if (next_zero_bit >= cma->count)
+ break;
+ next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ nr_zero = next_set_bit - next_zero_bit;
+ pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
+ nr_total += nr_zero;
+ start = next_zero_bit + nr_zero;
+ }
+ pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ mutex_unlock(&cma->lock);
+}
+#else
+static inline void cma_debug_show_areas(struct cma *cma) { }
+#endif
+
/**
* cma_alloc() - allocate pages from contiguous area
* @cma: Contiguous memory region for which the allocation is performed.
@@ -362,14 +383,15 @@ err:
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+ gfp_t gfp_mask)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
- int ret;
+ int ret = -ENOMEM;
if (!cma || !cma->count)
return NULL;
@@ -407,7 +429,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
- ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
+ gfp_mask);
mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -426,6 +449,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
trace_cma_alloc(pfn, page, count, align);
+ if (ret) {
+ pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n",
+ __func__, count, ret);
+ cma_debug_show_areas(cma);
+ }
+
pr_debug("%s(): returned %p\n", __func__, page);
return page;
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index f8e4b60db167..ffc0c3d0ae64 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count)
if (!mem)
return -ENOMEM;
- p = cma_alloc(cma, count, 0);
+ p = cma_alloc(cma, count, 0, GFP_KERNEL);
if (!p) {
kfree(mem);
return -ENOMEM;
diff --git a/mm/compaction.c b/mm/compaction.c
index 0409a4ad6ea1..81e1eaa2a2cf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -12,6 +12,7 @@
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
+#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
@@ -548,7 +549,7 @@ isolate_fail:
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
- count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
return total_isolated;
@@ -634,22 +635,6 @@ isolate_freepages_range(struct compact_control *cc,
return pfn;
}
-/* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
-{
- struct page *page;
- unsigned int count[2] = { 0, };
-
- if (list_empty(&cc->migratepages))
- return;
-
- list_for_each_entry(page, &cc->migratepages, lru)
- count[!!page_is_file_cache(page)]++;
-
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
-}
-
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
@@ -818,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = false;
}
- if (isolate_movable_page(page, isolate_mode))
+ if (!isolate_movable_page(page, isolate_mode))
goto isolate_success;
}
@@ -834,6 +819,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page_count(page) > page_mapcount(page))
goto isolate_fail;
+ /*
+ * Only allow to migrate anonymous pages in GFP_NOFS context
+ * because those do not depend on fs locks.
+ */
+ if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ goto isolate_fail;
+
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -866,6 +858,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
@@ -902,7 +896,6 @@ isolate_fail:
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
- acct_isolated(zone, cc);
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
cc->last_migrated_pfn = 0;
@@ -939,7 +932,7 @@ isolate_fail:
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
- count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -988,7 +981,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
break;
}
- acct_isolated(cc->zone, cc);
return pfn;
}
@@ -1258,10 +1250,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended) {
- acct_isolated(zone, cc);
+ if (!low_pfn || cc->contended)
return ISOLATE_ABORT;
- }
/*
* Either we isolated something and proceed with migration. Or
@@ -1271,7 +1261,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
break;
}
- acct_isolated(zone, cc);
/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;
@@ -1643,6 +1632,9 @@ out:
zone->compact_cached_free_pfn = free_pfn;
}
+ count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
+ count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
+
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
@@ -1657,6 +1649,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
@@ -1696,14 +1690,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
- int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /* Check if the GFP flags allow compaction */
- if (!may_enter_fs || !may_perform_io)
+ /*
+ * Check if the GFP flags allow compaction - GFP_NOIO is really
+ * tricky context because the migration might require IO
+ */
+ if (!may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1767,9 +1763,12 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
};
@@ -1892,14 +1891,17 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
+ .total_migrate_scanned = 0,
+ .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.classzone_idx);
- count_vm_event(KCOMPACTD_WAKE);
+ count_compact_event(KCOMPACTD_WAKE);
for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
int status;
@@ -1917,6 +1919,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
cc.nr_freepages = 0;
cc.nr_migratepages = 0;
+ cc.total_migrate_scanned = 0;
+ cc.total_free_scanned = 0;
cc.zone = zone;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1935,6 +1939,11 @@ static void kcompactd_do_work(pg_data_t *pgdat)
defer_compaction(zone, cc.order);
}
+ count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+ cc.total_migrate_scanned);
+ count_compact_events(KCOMPACTD_FREE_SCANNED,
+ cc.total_free_scanned);
+
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
}
@@ -1958,6 +1967,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
+ /*
+ * Pairs with implicit barrier in wait_event_freezable()
+ * such that wakeups are not missed in the lockless
+ * waitqueue_active() call.
+ */
+ smp_acquire__after_ctrl_dep();
+
if (pgdat->kcompactd_classzone_idx > classzone_idx)
pgdat->kcompactd_classzone_idx = classzone_idx;
@@ -2043,33 +2059,38 @@ void kcompactd_stop(int nid)
* away, we get changed to run anywhere: as the first one comes back,
* restore their cpu bindings.
*/
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int kcompactd_cpu_online(unsigned int cpu)
{
int nid;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
- mask = cpumask_of_node(pgdat->node_id);
+ mask = cpumask_of_node(pgdat->node_id);
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
- }
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
}
- return NOTIFY_OK;
+ return 0;
}
static int __init kcompactd_init(void)
{
int nid;
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/compaction:online",
+ kcompactd_cpu_online, NULL);
+ if (ret < 0) {
+ pr_err("kcompactd: failed to register hotplug callbacks.\n");
+ return ret;
+ }
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
- hotcpu_notifier(cpu_callback, 0);
return 0;
}
subsys_initcall(kcompactd_init)
diff --git a/mm/debug.c b/mm/debug.c
index 9feb699c5d25..db1cd26d8752 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+ sizeof(unsigned long), page,
+ sizeof(struct page), false);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index abcbfe86c25a..4d90a64b2fdc 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -93,7 +93,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
spin_unlock_irq(&pool->lock);
/* per-pool info, no real statistics yet */
- temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+ temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n",
pool->name, blocks,
pages * (pool->allocation / pool->size),
pool->size, pages);
@@ -434,11 +434,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
- "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
- pool->name, vaddr, (unsigned long long)dma);
+ "dma_pool_free %s, %p (bad vaddr)/%pad\n",
+ pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n",
- pool->name, vaddr, (unsigned long long)dma);
+ pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n",
+ pool->name, vaddr, &dma);
return;
}
{
@@ -450,11 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
- pool->name, (unsigned long long)dma);
+ dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n",
+ pool->name, &dma);
else
- pr_err("dma_pool_free %s, dma %Lx already free\n",
- pool->name, (unsigned long long)dma);
+ pr_err("dma_pool_free %s, dma %pad already free\n",
+ pool->name, &dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 6c707bfe02fd..a43013112581 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
}
if (end_index >= start_index) {
- unsigned long count = invalidate_mapping_pages(mapping,
+ unsigned long count;
+
+ /*
+ * It's common to FADV_DONTNEED right after
+ * the read or write that instantiates the
+ * pages, in which case there will be some
+ * sitting on the local LRU cache. Try to
+ * avoid the expensive remote drain and the
+ * second cache tree walk below by flushing
+ * them out right away.
+ */
+ lru_add_drain();
+
+ count = invalidate_mapping_pages(mapping,
start_index, end_index);
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 50b52fe51937..1694623a6289 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,6 +13,7 @@
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
+#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
@@ -132,44 +133,28 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!dax_mapping(mapping)) {
if (shadowp)
*shadowp = p;
- if (node)
- workingset_node_shadows_dec(node);
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
- (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
- RADIX_DAX_ENTRY_LOCK));
- /* DAX accounts exceptional entries as normal pages */
- if (node)
- workingset_node_pages_dec(node);
+ dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
/* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index,
- false);
+ dax_wake_mapping_entry_waiter(mapping, page->index, p,
+ true);
}
}
- radix_tree_replace_slot(slot, page);
+ __radix_tree_replace(&mapping->page_tree, node, slot, page,
+ workingset_update_node, mapping);
mapping->nrpages++;
- if (node) {
- workingset_node_pages_inc(node);
- /*
- * Don't track node that contains actual pages.
- *
- * Avoid acquiring the list_lru lock if already
- * untracked. The list_empty() test is safe as
- * node->private_list is protected by
- * mapping->tree_lock.
- */
- if (!list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- }
return 0;
}
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ int i, nr;
+
+ /* hugetlb pages are represented by one entry in the radix tree */
+ nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index + i,
&node, &slot);
- radix_tree_clear_tags(&mapping->page_tree, node, slot);
-
- if (!node) {
- VM_BUG_ON_PAGE(nr != 1, page);
- /*
- * We need a node to properly account shadow
- * entries. Don't plant any without. XXX
- */
- shadow = NULL;
- }
-
- radix_tree_replace_slot(slot, shadow);
+ VM_BUG_ON_PAGE(!node && nr != 1, page);
- if (!node)
- break;
-
- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
- continue;
-
- /*
- * Track node that only contains shadow entries. DAX mappings
- * contain no shadow entries and may contain other exceptional
- * entries so skip those.
- *
- * Avoid acquiring the list_lru lock if already tracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes,
- &node->private_list);
- }
+ radix_tree_clear_tags(&mapping->page_tree, node, slot);
+ __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+ workingset_update_node, mapping);
}
if (shadow) {
@@ -788,45 +740,165 @@ EXPORT_SYMBOL(__page_cache_alloc);
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-wait_queue_head_t *page_waitqueue(struct page *page)
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
{
- return bit_waitqueue(page, 0);
+ return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}
-EXPORT_SYMBOL(page_waitqueue);
-void wait_on_page_bit(struct page *page, int bit_nr)
+void __init pagecache_init(void)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ int i;
+
+ for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(&page_wait_table[i]);
- if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ page_writeback_init();
}
-EXPORT_SYMBOL(wait_on_page_bit);
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+struct wait_page_key {
+ struct page *page;
+ int bit_nr;
+ int page_match;
+};
+
+struct wait_page_queue {
+ struct page *page;
+ int bit_nr;
+ wait_queue_t wait;
+};
+
+static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ struct wait_page_key *key = arg;
+ struct wait_page_queue *wait_page
+ = container_of(wait, struct wait_page_queue, wait);
+
+ if (wait_page->page != key->page)
+ return 0;
+ key->page_match = 1;
- if (!test_bit(bit_nr, &page->flags))
+ if (wait_page->bit_nr != key->bit_nr)
+ return 0;
+ if (test_bit(key->bit_nr, &key->page->flags))
return 0;
- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io, TASK_KILLABLE);
+ return autoremove_wake_function(wait, mode, sync, key);
}
-int wait_on_page_bit_killable_timeout(struct page *page,
- int bit_nr, unsigned long timeout)
+static void wake_up_page_bit(struct page *page, int bit_nr)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ wait_queue_head_t *q = page_waitqueue(page);
+ struct wait_page_key key;
+ unsigned long flags;
- wait.key.timeout = jiffies + timeout;
- if (!test_bit(bit_nr, &page->flags))
- return 0;
- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io_timeout, TASK_KILLABLE);
+ key.page = page;
+ key.bit_nr = bit_nr;
+ key.page_match = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key(q, TASK_NORMAL, &key);
+ /*
+ * It is possible for other pages to have collided on the waitqueue
+ * hash, so in that case check for a page match. That prevents a long-
+ * term waiter
+ *
+ * It is still possible to miss a case here, when we woke page waiters
+ * and removed them from the waitqueue, but there are still other
+ * page waiters.
+ */
+ if (!waitqueue_active(q) || !key.page_match) {
+ ClearPageWaiters(page);
+ /*
+ * It's possible to miss clearing Waiters here, when we woke
+ * our page waiters, but the hashed waitqueue has waiters for
+ * other pages on it.
+ *
+ * That's okay, it's a rare case. The next waker will clear it.
+ */
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+
+static void wake_up_page(struct page *page, int bit)
+{
+ if (!PageWaiters(page))
+ return;
+ wake_up_page_bit(page, bit);
+}
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ struct page *page, int bit_nr, int state, bool lock)
+{
+ struct wait_page_queue wait_page;
+ wait_queue_t *wait = &wait_page.wait;
+ int ret = 0;
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.page = page;
+ wait_page.bit_nr = bit_nr;
+
+ for (;;) {
+ spin_lock_irq(&q->lock);
+
+ if (likely(list_empty(&wait->task_list))) {
+ if (lock)
+ __add_wait_queue_tail_exclusive(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ SetPageWaiters(page);
+ }
+
+ set_current_state(state);
+
+ spin_unlock_irq(&q->lock);
+
+ if (likely(test_bit(bit_nr, &page->flags))) {
+ io_schedule();
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ if (lock) {
+ if (!test_and_set_bit_lock(bit_nr, &page->flags))
+ break;
+ } else {
+ if (!test_bit(bit_nr, &page->flags))
+ break;
+ }
+ }
+
+ finish_wait(q, wait);
+
+ /*
+ * A signal could leave PageWaiters set. Clearing it here if
+ * !waitqueue_active would be possible (by open-coding finish_wait),
+ * but still fail to catch it in the case of wait hash collision. We
+ * already can fail to clear wait hash collision cases, so don't
+ * bother with signals either.
+ */
+
+ return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
-EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -842,10 +914,34 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, waiter);
+ SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
+#ifndef clear_bit_unlock_is_negative_byte
+
+/*
+ * PG_waiters is the high bit in the same byte as PG_lock.
+ *
+ * On x86 (and on many other architectures), we can clear PG_lock and
+ * test the sign bit at the same time. But if the architecture does
+ * not support that special operation, we just do this all by hand
+ * instead.
+ *
+ * The read of PG_waiters has to be after (or concurrently with) PG_locked
+ * being cleared, but a memory barrier should be unneccssary since it is
+ * in the same byte as PG_locked.
+ */
+static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
+{
+ clear_bit_unlock(nr, mem);
+ /* smp_mb__after_atomic(); */
+ return test_bit(PG_waiters, mem);
+}
+
+#endif
+
/**
* unlock_page - unlock a locked page
* @page: the page
@@ -855,16 +951,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
*/
void unlock_page(struct page *page)
{
+ BUILD_BUG_ON(PG_waiters != 7);
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
- wake_up_page(page, PG_locked);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+ wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -910,9 +1009,12 @@ void page_endio(struct page *page, bool is_write, int err)
unlock_page(page);
} else {
if (err) {
+ struct address_space *mapping;
+
SetPageError(page);
- if (page->mapping)
- mapping_set_error(page->mapping, err);
+ mapping = page_mapping(page);
+ if (mapping)
+ mapping_set_error(mapping, err);
}
end_page_writeback(page);
}
@@ -921,25 +1023,21 @@ EXPORT_SYMBOL_GPL(page_endio);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @page: the page to lock
+ * @__page: the page to lock
*/
-void __lock_page(struct page *page)
+void __lock_page(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
}
EXPORT_SYMBOL(__lock_page);
-int __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
- bit_wait_io, TASK_KILLABLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1686,7 +1784,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
int error = 0;
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
- return -EINVAL;
+ return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
index = *ppos >> PAGE_SHIFT;
@@ -1703,6 +1801,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
cond_resched();
find_page:
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out;
+ }
+
page = find_get_page(mapping, index);
if (!page) {
page_cache_sync_readahead(mapping,
@@ -2070,7 +2173,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
/**
* filemap_fault - read in file data for page fault handling
- * @vma: vma in which the fault was taken
* @vmf: struct vm_fault containing details of the fault
*
* filemap_fault() is invoked via the vma operations vector for a
@@ -2092,10 +2194,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*/
-int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_fault(struct vm_fault *vmf)
{
int error;
- struct file *file = vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
@@ -2117,12 +2219,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- do_async_mmap_readahead(vma, ra, file, page, offset);
+ do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
} else if (!page) {
/* No page in the page cache at all */
- do_sync_mmap_readahead(vma, ra, file, offset);
+ do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
@@ -2130,7 +2232,7 @@ retry_find:
goto no_cached_page;
}
- if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
put_page(page);
return ret | VM_FAULT_RETRY;
}
@@ -2213,12 +2315,12 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
@@ -2274,11 +2376,11 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2288,7 +2390,7 @@ skip:
put_page(page);
next:
/* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break;
if (iter.index == end_pgoff)
break;
@@ -2297,14 +2399,14 @@ next:
}
EXPORT_SYMBOL(filemap_map_pages);
-int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = file_inode(vmf->vma->vm_file);
int ret = VM_FAULT_LOCKED;
sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
+ file_update_time(vmf->vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping) {
unlock_page(page);
diff --git a/mm/gup.c b/mm/gup.c
index ec4f82704b6f..04aa405350dc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,7 +10,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
@@ -226,6 +226,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned int *page_mask)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
spinlock_t *ptl;
@@ -243,8 +244,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
-
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d))
+ return no_page_table(vma, flags);
+ BUILD_BUG_ON(p4d_huge(*p4d));
+ if (unlikely(p4d_bad(*p4d)))
+ return no_page_table(vma, flags);
+ pud = pud_offset(p4d, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
@@ -253,6 +259,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
+ if (pud_devmap(*pud)) {
+ ptl = pud_lock(mm, pud);
+ page = follow_devmap_pud(vma, address, pud, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
@@ -265,8 +278,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
- return no_page_table(vma, flags);
if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
@@ -277,6 +288,9 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (likely(!pmd_trans_huge(*pmd)))
return follow_page_pte(vma, address, pmd, flags);
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ return no_page_table(vma, flags);
+
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
@@ -317,6 +331,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
struct page **page)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -330,7 +345,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
else
pgd = pgd_offset_gate(mm, address);
BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ BUG_ON(p4d_none(*p4d));
+ pud = pud_offset(p4d, address);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
@@ -572,7 +589,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags);
+ gup_flags, nonblocking);
continue;
}
}
@@ -632,7 +649,8 @@ next_page:
return i;
}
-bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+static bool vma_permits_fault(struct vm_area_struct *vma,
+ unsigned int fault_flags)
{
bool write = !!(fault_flags & FAULT_FLAG_WRITE);
bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
@@ -857,18 +875,17 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
EXPORT_SYMBOL(get_user_pages_locked);
/*
- * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
- * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
+ * tsk, mm to be specified.
*
* NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
- * caller if required (just like with __get_user_pages). "FOLL_GET",
- * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
- * according to the parameters "pages", "write", "force"
- * respectively.
+ * caller if required (just like with __get_user_pages). "FOLL_GET"
+ * is set implicitly if "pages" is non-NULL.
*/
-__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
int locked = 1;
@@ -880,7 +897,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -894,10 +910,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* get_user_pages_unlocked(tsk, mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead, if the two parameters
- * "tsk" and "mm" are respectively equal to current and current->mm,
- * or if "force" shall be set to 1 (get_user_pages_fast misses the
- * "force" parameter).
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
*/
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -920,6 +934,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -963,10 +980,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -974,8 +991,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's. We also
- * obviously don't pass FOLL_REMOTE in here.
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1391,13 +1409,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
- pudp = pud_offset(&pgd, addr);
+ pudp = pud_offset(&p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);
@@ -1419,6 +1437,31 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
+static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ p4d_t *p4dp;
+
+ p4dp = p4d_offset(&pgd, addr);
+ do {
+ p4d_t p4d = READ_ONCE(*p4dp);
+
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(p4d))
+ return 0;
+ BUILD_BUG_ON(p4d_huge(p4d));
+ if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
+ if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
+ P4D_SHIFT, next, write, pages, nr))
+ return 0;
+ } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ return 0;
+ } while (p4dp++, addr = next, addr != end);
+
+ return 1;
+}
+
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP. It will only return non-negative values.
@@ -1469,7 +1512,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, write, pages, &nr))
break;
- } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d4a6e4001512..f3c4f9d22821 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -9,6 +9,8 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
@@ -142,42 +144,6 @@ static struct shrinker huge_zero_page_shrinker = {
};
#ifdef CONFIG_SYSFS
-
-static ssize_t triple_flag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag deferred,
- enum transparent_hugepage_flag req_madv)
-{
- if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
- if (enabled == deferred)
- return -EINVAL;
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- set_bit(deferred, &transparent_hugepage_flags);
- } else if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
- clear_bit(deferred, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- set_bit(enabled, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(deferred, &transparent_hugepage_flags);
- set_bit(req_madv, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
- clear_bit(enabled, &transparent_hugepage_flags);
- clear_bit(req_madv, &transparent_hugepage_flags);
- clear_bit(deferred, &transparent_hugepage_flags);
- } else
- return -EINVAL;
-
- return count;
-}
-
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -193,19 +159,28 @@ static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- ssize_t ret;
+ ssize_t ret = count;
- ret = triple_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ ret = -EINVAL;
if (ret > 0) {
int err = start_stop_khugepaged();
if (err)
ret = err;
}
-
return ret;
}
static struct kobj_attribute enabled_attr =
@@ -241,32 +216,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
return count;
}
-/*
- * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
- * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
- * memory just to allocate one more hugepage.
- */
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] defer madvise never\n");
+ return sprintf(buf, "[always] defer defer+madvise madvise never\n");
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [defer] madvise never\n");
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer [madvise] never\n");
- else
- return sprintf(buf, "always defer madvise [never]\n");
-
+ return sprintf(buf, "always [defer] defer+madvise madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [defer+madvise] madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer defer+madvise [madvise] never\n");
+ return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}
+
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return triple_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("defer+madvise", buf,
+ min(sizeof("defer+madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
}
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
@@ -285,6 +286,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
}
static struct kobj_attribute use_zero_page_attr =
__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+
+static ssize_t hpage_pmd_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+}
+static struct kobj_attribute hpage_pmd_size_attr =
+ __ATTR_RO(hpage_pmd_size);
+
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -307,6 +317,7 @@ static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
+ &hpage_pmd_size_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
&shmem_enabled_attr.attr,
#endif
@@ -532,13 +543,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
gfp_t gfp)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -563,9 +574,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
*/
__SetPageUptodate(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
@@ -576,11 +587,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -590,11 +601,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -602,25 +613,28 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
}
/*
- * If THP defrag is set to always then directly reclaim/compact as necessary
- * If set to defer then do only background reclaim/compact and defer to khugepaged
- * If set to madvise and the VMA is flagged then directly reclaim/compact
- * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
+ * always: directly stall for all thp allocations
+ * defer: wake kswapd and fail if not immediately available
+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
+ * fail if not immediately available
+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
+ * available
+ * never: never stall for any thp allocation
*/
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
- bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
- &transparent_hugepage_flags) && vma_madvised)
- return GFP_TRANSHUGE;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
- &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
- &transparent_hugepage_flags))
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
-
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ 0);
return GFP_TRANSHUGE_LIGHT;
}
@@ -641,12 +655,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct fault_env *fe)
+int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -654,7 +668,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
@@ -670,22 +684,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
set = false;
- if (pmd_none(*fe->pmd)) {
+ if (pmd_none(*vmf->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(fe->ptl);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ spin_unlock(vmf->ptl);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, fe->pmd, zero_page);
- spin_unlock(fe->ptl);
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
set = true;
}
} else
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
if (!set)
pte_free(vma->vm_mm, pgtable);
return ret;
@@ -697,7 +711,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(fe, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -737,13 +751,68 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- if (track_pfn_insert(vma, &pgprot, pfn))
- return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pud = pud_mkwrite(pud);
+ return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t entry;
+ spinlock_t *ptl;
+
+ ptl = pud_lock(mm, pud);
+ entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pud_mkdevmap(entry);
+ if (write) {
+ entry = pud_mkyoung(pud_mkdirty(entry));
+ entry = maybe_pud_mkwrite(entry, vma);
+ }
+ set_pud_at(mm, addr, pud, entry);
+ update_mmu_cache_pud(vma, addr, pud);
+ spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, pfn_t pfn, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * If we had pud_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON(!pfn_t_devmap(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn);
+
+ insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
@@ -772,6 +841,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
+ /*
+ * When we COW a devmap PMD entry, we split it into PTEs, so we should
+ * not be in this function with `flags & FOLL_COW` set.
+ */
+ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -868,30 +943,149 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud)
+{
+ pud_t _pud;
+
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but for now
+ * the dirty bit in the pud is meaningless. And if the dirty
+ * bit will become meaningful and we'll only set it with
+ * FOLL_WRITE, an atomic set_bit will be required on the pud to
+ * set the young bit, instead of the current set_pud_at.
+ */
+ _pud = pud_mkyoung(pud_mkdirty(*pud));
+ if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+ pud, _pud, 1))
+ update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, int flags)
+{
+ unsigned long pfn = pud_pfn(*pud);
+ struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap;
+ struct page *page;
+
+ assert_spin_locked(pud_lockptr(mm, pud));
+
+ if (flags & FOLL_WRITE && !pud_write(*pud))
+ return NULL;
+
+ if (pud_present(*pud) && pud_devmap(*pud))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pud(vma, addr, pud);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ */
+ if (!(flags & FOLL_GET))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ get_page(page);
+ put_dev_pagemap(pgmap);
+
+ return page;
+}
+
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ spinlock_t *dst_ptl, *src_ptl;
+ pud_t pud;
+ int ret;
+
+ dst_ptl = pud_lock(dst_mm, dst_pud);
+ src_ptl = pud_lockptr(src_mm, src_pud);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pud = *src_pud;
+ if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+ goto out_unlock;
+
+ /*
+ * When page table lock is held, the huge zero pud should not be
+ * under splitting since we don't split the page itself, only pud to
+ * a page table.
+ */
+ if (is_huge_zero_pud(pud)) {
+ /* No huge zero pud yet */
+ }
+
+ pudp_set_wrprotect(src_mm, addr, src_pud);
+ pud = pud_mkold(pud_wrprotect(pud));
+ set_pud_at(dst_mm, addr, dst_pud, pud);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+ pud_t entry;
+ unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+ if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+ goto unlock;
+
+ entry = pud_mkyoung(orig_pud);
+ if (write)
+ entry = pud_mkdirty(entry);
+ haddr = vmf->address & HPAGE_PUD_MASK;
+ if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+ update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+ spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
- fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = fe->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
- fe->flags & FAULT_FLAG_WRITE))
- update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
+ if (write)
+ entry = pmd_mkdirty(entry);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
@@ -908,9 +1102,8 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
- __GFP_OTHER_NODE, vma,
- fe->address, page_to_nid(page));
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
@@ -941,15 +1134,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -958,20 +1151,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- fe->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*fe->pte));
- set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
- pte_unmap(fe->pte);
+ vmf->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, fe->pmd, pgtable);
+ pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -982,7 +1175,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
@@ -994,23 +1187,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
- fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1023,13 +1216,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1042,12 +1235,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
+ ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1059,7 +1252,7 @@ alloc:
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
put_page(page);
ret |= VM_FAULT_FALLBACK;
@@ -1079,11 +1272,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(fe->ptl);
+ spin_lock(vmf->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
- spin_unlock(fe->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1091,12 +1284,12 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
} else {
@@ -1106,16 +1299,26 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -1126,7 +1329,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
goto out;
/* Avoid dumping huge zero page */
@@ -1185,12 +1388,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
+int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1198,8 +1401,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
bool was_writable;
int flags = 0;
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(pmd, *fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
@@ -1207,9 +1410,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*fe->pmd))) {
- page = pmd_page(*fe->pmd);
- spin_unlock(fe->ptl);
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1225,7 +1428,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
}
/* See similar comment in do_numa_page for explanation */
- if (!pmd_write(pmd))
+ if (!pmd_savedwrite(pmd))
flags |= TNF_NO_GROUP;
/*
@@ -1242,7 +1445,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1253,12 +1456,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* to serialises splits
*/
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(pmd, *fe->pmd))) {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1276,9 +1479,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- fe->pmd, pmd, fe->address, page, target_nid);
+ vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1288,23 +1491,24 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
- was_writable = pmd_write(pmd);
+ was_writable = pmd_savedwrite(pmd);
pmd = pmd_modify(pmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ flags);
return 0;
}
@@ -1322,6 +1526,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = tlb->mm;
bool ret = false;
+ tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
goto out_unlocked;
@@ -1362,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
deactivate_page(page);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
@@ -1377,12 +1582,23 @@ out_unlocked:
return ret;
}
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pte_free(mm, pgtable);
+ atomic_long_dec(&mm->nr_ptes);
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
+ tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
@@ -1398,12 +1614,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
- tlb_remove_page(tlb, pmd_page(orig_pmd));
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
- tlb_remove_page(tlb, pmd_page(orig_pmd));
+ tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
@@ -1416,6 +1632,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
atomic_long_dec(&tlb->mm->nr_ptes);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -1424,6 +1642,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+ spinlock_t *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ /*
+ * With split pmd lock we also need to move preallocated
+ * PTE page table if new_pmd is on different PMD page table.
+ *
+ * We also don't deposit and withdraw tables for file pages.
+ */
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1461,8 +1694,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
- vma_is_anonymous(vma)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -1491,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- int ret = 0;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- pmd_t entry;
- bool preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
+ if (!ptl)
+ return 0;
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd)) {
- spin_unlock(ptl);
- return ret;
- }
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
- if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mkwrite(entry);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
- pmd_write(entry));
- }
- spin_unlock(ptl);
- }
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+ /*
+ * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+ * which is also under down_read(mmap_sem):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = *pmd;
+ pmdp_invalidate(vma, addr, pmd);
+
+ /*
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
+ * corrupt them.
+ */
+ if (pmd_dirty(*pmd))
+ entry = pmd_mkdirty(entry);
+ if (pmd_young(*pmd))
+ entry = pmd_mkyoung(entry);
+
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
return ret;
}
@@ -1541,6 +1805,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
return NULL;
}
+/*
+ * Returns true if a given pud maps a thp, false otherwise.
+ *
+ * Note that if it returns true, this routine returns without unlocking page
+ * table lock. So callers must unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+ spinlock_t *ptl;
+
+ ptl = pud_lock(vma->vm_mm, pud);
+ if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+ return ptl;
+ spin_unlock(ptl);
+ return NULL;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr)
+{
+ pud_t orig_pud;
+ spinlock_t *ptl;
+
+ ptl = __pud_trans_huge_lock(pud, vma);
+ if (!ptl)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pudp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pudp related
+ * operations.
+ */
+ orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
+ tlb->fullmm);
+ tlb_remove_pud_tlb_entry(tlb, pud, addr);
+ if (vma_is_dax(vma)) {
+ spin_unlock(ptl);
+ /* No zero page support yet */
+ } else {
+ /* No support for anonymous PUD pages yet */
+ BUG();
+ }
+ return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long haddr)
+{
+ VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+ VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+ count_vm_event(THP_SPLIT_PUD);
+
+ pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PUD_MASK;
+
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
+ ptl = pud_lock(mm, pud);
+ if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+ goto out;
+ __split_huge_pud_locked(vma, pud, haddr);
+
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd)
{
@@ -1588,6 +1930,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ /*
+ * We are going to unmap this huge page. So
+ * just go ahead and zap it
+ */
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
@@ -1731,6 +2079,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
bool freeze, struct page *page)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -1738,7 +2087,11 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
if (!pgd_present(*pgd))
return;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return;
+
+ pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return;
@@ -1791,32 +2144,27 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void freeze_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
- TTU_RMAP_LOCKED;
- int i, ret;
+ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+ int ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_MIGRATION;
- /* We only need TTU_SPLIT_HUGE_PMD once */
- ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
- for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
- /* Cut short if the page is unmapped */
- if (page_count(page) == 1)
- return;
-
- ret = try_to_unmap(page + i, ttu_flags);
- }
- VM_BUG_ON_PAGE(ret, page + i - 1);
+ ret = try_to_unmap(page, ttu_flags);
+ VM_BUG_ON_PAGE(ret, page);
}
static void unfreeze_page(struct page *page)
{
int i;
-
- for (i = 0; i < HPAGE_PMD_NR; i++)
- remove_migration_ptes(page + i, page + i, true);
+ if (PageTransHuge(page)) {
+ remove_migration_ptes(page, page, true);
+ } else {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
+ }
}
static void __split_huge_page_tail(struct page *head, int tail,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418bf01a50ed..e5828875f7bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -32,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/userfaultfd_k.h>
#include "internal.h"
int hugepages_treat_as_movable;
@@ -1051,7 +1053,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages)
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ GFP_KERNEL);
}
static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1773,23 +1776,32 @@ free:
}
/*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- /* Uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
-
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
- return;
+ goto out;
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
@@ -1799,12 +1811,22 @@ static void return_unused_surplus_pages(struct hstate *h,
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
*/
while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
- break;
+ goto out;
cond_resched_lock(&hugetlb_lock);
}
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
}
@@ -3122,7 +3144,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
-static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
@@ -3286,6 +3308,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ /*
+ * This is a hugetlb vma, all the pte entries should point
+ * to huge page.
+ */
+ tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
@@ -3336,7 +3363,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- tlb_remove_tlb_entry(tlb, ptep, address);
+ tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -3450,15 +3477,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ unsigned long address, pte_t *ptep,
+ struct page *pagecache_page, spinlock_t *ptl)
{
+ pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
@@ -3654,6 +3683,38 @@ retry:
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
+
+ /*
+ * Check for page in userfault range
+ */
+ if (userfaultfd_missing(vma)) {
+ u32 hash;
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = address,
+ .flags = flags,
+ /*
+ * Hard to debug if it ends up being
+ * used by a callee that assumes
+ * something about the other
+ * uninitialized fields... same as in
+ * memory.c
+ */
+ };
+
+ /*
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
+ */
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
+ idx, address);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -3711,8 +3772,7 @@ retry:
vma_end_reservation(h, vma, address);
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -3733,7 +3793,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3888,8 +3948,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep,
+ pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
@@ -3923,10 +3983,113 @@ out_mutex:
return ret;
}
+/*
+ * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
+ * modifications for huge pages.
+ */
+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ struct hstate *h = hstate_vma(dst_vma);
+ pte_t _dst_pte;
+ spinlock_t *ptl;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page))
+ goto out;
+
+ ret = copy_huge_page_from_user(page,
+ (const void __user *) src_addr,
+ pages_per_huge_page(h), false);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+ set_page_huge_active(page);
+
+ /*
+ * If shared, add to page cache
+ */
+ if (vm_shared) {
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+ ret = huge_add_to_page_cache(page, mapping, idx);
+ if (ret)
+ goto out_release_nounlock;
+ }
+
+ ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+ spin_lock(ptl);
+
+ ret = -EEXIST;
+ if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ goto out_release_unlock;
+
+ if (vm_shared) {
+ page_dup_rmap(page, true);
+ } else {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ }
+
+ _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
+ _dst_pte = pte_mkyoung(_dst_pte);
+
+ set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
+ dst_vma->vm_flags & VM_WRITE);
+ hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ spin_unlock(ptl);
+ if (vm_shared)
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release_unlock:
+ spin_unlock(ptl);
+out_release_nounlock:
+ if (vm_shared)
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags)
+ long i, unsigned int flags, int *nonblocking)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -3989,16 +4152,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
int ret;
+ unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
- ret = hugetlb_fault(mm, vma, vaddr,
- (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
- if (!(ret & VM_FAULT_ERROR))
- continue;
-
- remainder = 0;
- break;
+ if (flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT;
+ if (flags & FOLL_TRIED) {
+ VM_WARN_ON_ONCE(fault_flags &
+ FAULT_FLAG_ALLOW_RETRY);
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
+ ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ remainder = 0;
+ break;
+ }
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ *nr_pages = 0;
+ /*
+ * VM_FAULT_RETRY must not return an
+ * error, it will return zero
+ * instead.
+ *
+ * No need to update "position" as the
+ * caller will not check it after
+ * *nr_pages is set to 0.
+ */
+ return i;
+ }
+ continue;
}
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
@@ -4027,6 +4217,11 @@ same_page:
spin_unlock(ptl);
}
*nr_pages = remainder;
+ /*
+ * setting position is actually required only if remainder is
+ * not zero but it's faster not to add a "if (remainder)"
+ * branch.
+ */
*position = vaddr;
return i ? i : -EFAULT;
@@ -4208,7 +4403,9 @@ int hugetlb_reserve_pages(struct inode *inode,
return 0;
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_abort(resv_map, from, to);
+ /* Don't call region_abort if region_chg failed */
+ if (chg >= 0)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
@@ -4330,8 +4527,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -4361,7 +4557,8 @@ out:
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
+ p4d_t *p4d = p4d_offset(pgd, *addr);
+ pud_t *pud = pud_offset(p4d, *addr);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
@@ -4392,11 +4589,13 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
@@ -4416,18 +4615,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd = NULL;
+ pmd_t *pmd;
pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_huge(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
+ if (!pgd_present(*pgd))
+ return NULL;
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return NULL;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return NULL;
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
return (pte_t *) pmd;
}
@@ -4450,6 +4653,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
{
struct page *page = NULL;
spinlock_t *ptl;
+ pte_t pte;
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4459,12 +4663,13 @@ retry:
*/
if (!pmd_huge(*pmd))
goto out;
- if (pmd_present(*pmd)) {
+ pte = huge_ptep_get((pte_t *)pmd);
+ if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
- if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
goto retry;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a56a851908d2..975e49f00f34 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,6 +6,7 @@
#include <linux/cpumask.h>
#include <linux/atomic.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -21,5 +22,6 @@ struct mm_struct init_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..266efaeaa370 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,11 +36,18 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+void page_writeback_init(void);
+
+int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+}
+
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
@@ -131,9 +138,9 @@ struct alloc_context {
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
static inline unsigned long
-__find_buddy_index(unsigned long page_idx, unsigned int order)
+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
- return page_idx ^ (1 << order);
+ return page_pfn ^ (1 << order);
}
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
@@ -173,6 +180,8 @@ struct compact_control {
struct list_head migratepages; /* List of pages being migrated */
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long total_migrate_scanned;
+ unsigned long total_free_scanned;
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
@@ -326,12 +335,15 @@ __vma_address(struct page *page, struct vm_area_struct *vma)
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address = __vma_address(page, vma);
+ unsigned long start, end;
+
+ start = __vma_address(page, vma);
+ end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
/* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
- return address;
+ return max(start, vma->vm_start);
}
#else /* !CONFIG_MMU */
@@ -469,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
enum ttu_flags;
struct tlbflush_unmap_batch;
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0e9505f66ec1..98b27195e38b 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
@@ -39,6 +40,16 @@
#include "kasan.h"
#include "../slab.h"
+void kasan_enable_current(void)
+{
+ current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+ current->kasan_depth--;
+}
+
/*
* Poisons the shadow memory for 'size' bytes starting from 'addr'.
* Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
@@ -80,7 +91,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
/* Unpoison the stack for the current task beyond a watermark sp value. */
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
{
- __kasan_unpoison_stack(current, watermark);
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
}
/*
@@ -428,7 +446,7 @@ void kasan_cache_shrink(struct kmem_cache *cache)
quarantine_remove_cache(cache);
}
-void kasan_cache_destroy(struct kmem_cache *cache)
+void kasan_cache_shutdown(struct kmem_cache *cache)
{
quarantine_remove_cache(cache);
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 1c260e6b3b3c..dd2dea8eb077 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool kasan_report_enabled(void)
-{
- return !current->kasan_depth;
-}
-
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index 3f9a41cf0ac6..b96a5f773d88 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -15,6 +15,7 @@
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
+#include <linux/mm.h>
#include <linux/pfn.h>
#include <asm/page.h>
@@ -29,6 +30,9 @@
*/
unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+#if CONFIG_PGTABLE_LEVELS > 4
+p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss;
+#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
#endif
@@ -49,7 +53,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
pte_t *pte = pte_offset_kernel(pmd, addr);
pte_t zero_pte;
- zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL);
+ zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL);
zero_pte = pte_wrprotect(zero_pte);
while (addr + PAGE_SIZE <= end) {
@@ -69,7 +73,7 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
next = pmd_addr_end(addr, end);
if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
@@ -81,10 +85,10 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
} while (pmd++, addr = next, addr != end);
}
-static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
- pud_t *pud = pud_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
unsigned long next;
do {
@@ -92,9 +96,9 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
pmd_t *pmd;
- pud_populate(&init_mm, pud, kasan_zero_pmd);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
@@ -106,6 +110,23 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
} while (pud++, addr = next, addr != end);
}
+static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+ unsigned long end)
+{
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ unsigned long next;
+
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (p4d_none(*p4d)) {
+ p4d_populate(&init_mm, p4d,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ zero_pud_populate(p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+}
+
/**
* kasan_populate_zero_shadow - populate shadow memory region with
* kasan_zero_page
@@ -124,6 +145,7 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
next = pgd_addr_end(addr, end);
if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -134,12 +156,25 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
* 3,2 - level page tables where we don't have
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
+ *
+ * The ifndef is required to avoid build breakage.
+ *
+ * With 5level-fixup.h, pgd_populate() is not nop and
+ * we reference kasan_zero_p4d. It's not defined
+ * unless 5-level paging enabled.
+ *
+ * The ifndef can be dropped once all KASAN-enabled
+ * architectures will switch to pgtable-nop4d.h.
*/
- pgd_populate(&init_mm, pgd, kasan_zero_pud);
- pud = pud_offset(pgd, addr);
- pud_populate(&init_mm, pud, kasan_zero_pmd);
+#ifndef __ARCH_HAS_5LEVEL_HACK
+ pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d));
+#endif
+ p4d = p4d_offset(pgd, addr);
+ p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ pud = pud_offset(p4d, addr);
+ pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
continue;
}
@@ -147,6 +182,6 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
pgd_populate(&init_mm, pgd,
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
}
- zero_pud_populate(pgd, addr, next);
+ zero_p4d_populate(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index baabaad4a4aa..3a8ddf8baf7d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -25,6 +25,7 @@
#include <linux/printk.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
+#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -86,24 +87,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
qlist_init(from);
}
-static void qlist_move(struct qlist_head *from, struct qlist_node *last,
- struct qlist_head *to, size_t size)
-{
- if (unlikely(last == from->tail)) {
- qlist_move_all(from, to);
- return;
- }
- if (qlist_empty(to))
- to->head = from->head;
- else
- to->tail->next = from->head;
- to->tail = last;
- from->head = last->next;
- last->next = NULL;
- from->bytes -= size;
- to->bytes += size;
-}
-
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+ (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
/*
* The object quarantine consists of per-cpu queues and a global queue,
@@ -111,11 +97,23 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
*/
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
-static struct qlist_head global_quarantine;
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
+DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
-static unsigned long quarantine_size;
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
/*
* The fraction of physical memory the quarantine is allowed to occupy.
@@ -124,9 +122,6 @@ static unsigned long quarantine_size;
*/
#define QUARANTINE_FRACTION 32
-#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
-#define QUARANTINE_PERCPU_SIZE (1 << 20)
-
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
@@ -180,62 +175,89 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ /*
+ * Note: irq must be disabled until after we move the batch to the
+ * global quarantine. Otherwise quarantine_remove_cache() can miss
+ * some objects belonging to the cache if they are in our local temp
+ * list. quarantine_remove_cache() executes on_each_cpu() at the
+ * beginning which ensures that it either sees the objects in per-cpu
+ * lists or in the global quarantine.
+ */
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
qlist_put(q, &info->quarantine_link, cache->size);
- if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+ if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- local_irq_restore(flags);
-
- if (unlikely(!qlist_empty(&temp))) {
- spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_all(&temp, &global_quarantine);
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ spin_lock(&quarantine_lock);
+ WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+ qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+ if (global_quarantine[quarantine_tail].bytes >=
+ READ_ONCE(quarantine_batch_size)) {
+ int new_tail;
+
+ new_tail = quarantine_tail + 1;
+ if (new_tail == QUARANTINE_BATCHES)
+ new_tail = 0;
+ if (new_tail != quarantine_head)
+ quarantine_tail = new_tail;
+ }
+ spin_unlock(&quarantine_lock);
}
+
+ local_irq_restore(flags);
}
void quarantine_reduce(void)
{
- size_t new_quarantine_size, percpu_quarantines;
+ size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
+ int srcu_idx;
struct qlist_head to_free = QLIST_INIT;
- size_t size_to_free = 0;
- struct qlist_node *last;
- if (likely(READ_ONCE(global_quarantine.bytes) <=
- READ_ONCE(quarantine_size)))
+ if (likely(READ_ONCE(quarantine_size) <=
+ READ_ONCE(quarantine_max_size)))
return;
+ /*
+ * srcu critical section ensures that quarantine_remove_cache()
+ * will not miss objects belonging to the cache while they are in our
+ * local to_free list. srcu is chosen because (1) it gives us private
+ * grace period domain that does not interfere with anything else,
+ * and (2) it allows synchronize_srcu() to return without waiting
+ * if there are no pending read critical sections (which is the
+ * expected case).
+ */
+ srcu_idx = srcu_read_lock(&remove_cache_srcu);
spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
- new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
- new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
- 0 : new_quarantine_size - percpu_quarantines;
- WRITE_ONCE(quarantine_size, new_quarantine_size);
-
- last = global_quarantine.head;
- while (last) {
- struct kmem_cache *cache = qlink_to_cache(last);
-
- size_to_free += cache->size;
- if (!last->next || size_to_free >
- global_quarantine.bytes - QUARANTINE_LOW_SIZE)
- break;
- last = last->next;
+ new_quarantine_size = (total_size < percpu_quarantines) ?
+ 0 : total_size - percpu_quarantines;
+ WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+ /* Aim at consuming at most 1/2 of slots in quarantine. */
+ WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+ 2 * total_size / QUARANTINE_BATCHES));
+
+ if (likely(quarantine_size > quarantine_max_size)) {
+ qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+ WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+ quarantine_head++;
+ if (quarantine_head == QUARANTINE_BATCHES)
+ quarantine_head = 0;
}
- qlist_move(&global_quarantine, last, &to_free, size_to_free);
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
+ srcu_read_unlock(&remove_cache_srcu, srcu_idx);
}
static void qlist_move_cache(struct qlist_head *from,
@@ -273,16 +295,34 @@ static void per_cpu_remove_cache(void *arg)
qlist_free_all(&to_free, cache);
}
+/* Free all quarantined objects belonging to cache. */
void quarantine_remove_cache(struct kmem_cache *cache)
{
- unsigned long flags;
+ unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
+ /*
+ * Must be careful to not miss any objects that are being moved from
+ * per-cpu list to the global quarantine in quarantine_put(),
+ * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * achieves the first goal, while synchronize_srcu() achieves the
+ * second.
+ */
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
- qlist_move_cache(&global_quarantine, &to_free, cache);
+ for (i = 0; i < QUARANTINE_BATCHES; i++) {
+ if (qlist_empty(&global_quarantine[i]))
+ continue;
+ qlist_move_cache(&global_quarantine[i], &to_free, cache);
+ /* Scanning whole quarantine can take a while. */
+ spin_unlock_irqrestore(&quarantine_lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&quarantine_lock, flags);
+ }
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
+
+ synchronize_srcu(&remove_cache_srcu);
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 073325aedc68..ab42a0803f16 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,9 @@
*
*/
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -136,6 +139,8 @@ static void kasan_end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
kasan_enable_current();
}
@@ -290,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info)
kasan_end_report(&flags);
}
+static unsigned long kasan_flags;
+
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
+
+bool kasan_save_enable_multi_shot(void)
+{
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
+
+void kasan_restore_multi_shot(bool enabled)
+{
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
+
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+static inline bool kasan_report_enabled(void)
+{
+ if (current->kasan_depth)
+ return false;
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip)
{
@@ -298,6 +337,8 @@ void kasan_report(unsigned long addr, size_t size,
if (likely(!kasan_report_enabled()))
return;
+ disable_trace_on_warning();
+
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 87e1a7ca3846..ba40b7f673f4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2,6 +2,8 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -420,7 +422,7 @@ int __khugepaged_enter(struct mm_struct *mm)
list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
@@ -875,13 +877,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- pte_t pteval;
int swapped_in = 0, ret = 0;
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
};
/* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +891,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- fe.pte = pte_offset_map(pmd, address);
- for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- fe.pte++, fe.address += PAGE_SIZE) {
- pteval = *fe.pte;
- if (!is_swap_pte(pteval))
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
continue;
swapped_in++;
- ret = do_swap_page(&fe, pteval);
+ ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
@@ -915,10 +917,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- fe.pte = pte_offset_map(pmd, fe.address);
+ vmf.pte = pte_offset_map(pmd, vmf.address);
}
- fe.pte--;
- pte_unmap(fe.pte);
+ vmf.pte--;
+ pte_unmap(vmf.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -943,7 +945,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
* Before allocating the hugepage, release the mmap_sem read lock.
@@ -1309,8 +1311,7 @@ static void collapse_shmem(struct mm_struct *mm,
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() |
- __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
@@ -1403,6 +1404,9 @@ static void collapse_shmem(struct mm_struct *mm,
spin_lock_irq(&mapping->tree_lock);
+ slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@@ -1423,9 +1427,10 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(slot,
+ radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
+ slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
@@ -1521,9 +1526,10 @@ tree_unlocked:
if (!page || iter.index < page->index) {
if (!nr_none)
break;
- /* Put holes back where they were */
- radix_tree_replace_slot(slot, NULL);
nr_none--;
+ /* Put holes back where they were */
+ radix_tree_delete(&mapping->page_tree,
+ iter.index);
continue;
}
@@ -1532,7 +1538,9 @@ tree_unlocked:
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(slot, page);
+ radix_tree_replace_slot(&mapping->page_tree,
+ slot, page);
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
@@ -1616,8 +1624,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d1380ed93fdf..20036d4f9f13 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -19,7 +19,7 @@
*
*
* For more information on the algorithm and kmemleak usage, please see
- * Documentation/kmemleak.txt.
+ * Documentation/dev-tools/kmemleak.rst.
*
* Notes on locking
* ----------------
@@ -73,7 +73,9 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/export.h>
@@ -1414,7 +1416,7 @@ static void kmemleak_scan(void)
/* data/bss scanning */
scan_large_block(_sdata, _edata);
scan_large_block(__bss_start, __bss_stop);
- scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init);
+ scan_large_block(__start_ro_after_init, __end_ro_after_init);
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
diff --git a/mm/ksm.c b/mm/ksm.c
index 9ae6011a41f8..19b4f2dea7a5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -19,6 +19,8 @@
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
@@ -223,6 +225,12 @@ static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
+/* Checksum of an empty (zeroed) page */
+static unsigned int zero_checksum __read_mostly;
+
+/* Whether to merge empty (zeroed) pages with actual zero pages */
+static bool ksm_use_zero_pages __read_mostly;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -850,33 +858,36 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long addr;
- pte_t *ptep;
- spinlock_t *ptl;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ };
int swapped;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
- addr = page_address_in_vma(page, vma);
- if (addr == -EFAULT)
+ pvmw.address = page_address_in_vma(page, vma);
+ if (pvmw.address == -EFAULT)
goto out;
BUG_ON(PageTransCompound(page));
- mmun_start = addr;
- mmun_end = addr + PAGE_SIZE;
+ mmun_start = pvmw.address;
+ mmun_end = pvmw.address + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!page_vma_mapped_walk(&pvmw))
goto out_mn;
+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
+ goto out_unlock;
- if (pte_write(*ptep) || pte_dirty(*ptep)) {
+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
pte_t entry;
swapped = PageSwapCache(page);
- flush_cache_page(vma, addr, page_to_pfn(page));
+ flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
@@ -886,25 +897,29 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*/
- entry = ptep_clear_flush_notify(vma, addr, ptep);
+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
- set_pte_at(mm, addr, ptep, entry);
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
if (pte_dirty(entry))
set_page_dirty(page);
- entry = pte_mkclean(pte_wrprotect(entry));
- set_pte_at_notify(mm, addr, ptep, entry);
+
+ if (pte_protnone(entry))
+ entry = pte_mkclean(pte_clear_savedwrite(entry));
+ else
+ entry = pte_mkclean(pte_wrprotect(entry));
+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
- *orig_pte = *ptep;
+ *orig_pte = *pvmw.pte;
err = 0;
out_unlock:
- pte_unmap_unlock(ptep, ptl);
+ page_vma_mapped_walk_done(&pvmw);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
@@ -926,6 +941,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
+ pte_t newpte;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
@@ -950,12 +966,22 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
goto out_mn;
}
- get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr, false);
+ /*
+ * No need to check ksm_use_zero_pages here: we can only have a
+ * zero_page here if ksm_use_zero_pages was enabled alreaady.
+ */
+ if (!is_zero_pfn(page_to_pfn(kpage))) {
+ get_page(kpage);
+ page_add_anon_rmap(kpage, vma, addr, false);
+ newpte = mk_pte(kpage, vma->vm_page_prot);
+ } else {
+ newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
+ vma->vm_page_prot));
+ }
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ set_pte_at_notify(mm, addr, ptep, newpte);
page_remove_rmap(page, false);
if (!page_mapped(page))
@@ -1467,6 +1493,23 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
return;
}
+ /*
+ * Same checksum as an empty page. We attempt to merge it with the
+ * appropriate zero page if the user enabled this via sysfs.
+ */
+ if (ksm_use_zero_pages && (checksum == zero_checksum)) {
+ struct vm_area_struct *vma;
+
+ vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
+ err = try_to_merge_one_page(vma, page,
+ ZERO_PAGE(rmap_item->address));
+ /*
+ * In case of failure, the page was not really empty, so we
+ * need to continue. Otherwise we're done.
+ */
+ if (!err)
+ return;
+ }
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
if (tree_rmap_item) {
@@ -1813,7 +1856,7 @@ int __ksm_enter(struct mm_struct *mm)
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait);
@@ -2233,6 +2276,28 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
KSM_ATTR(merge_across_nodes);
#endif
+static ssize_t use_zero_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_use_zero_pages);
+}
+static ssize_t use_zero_pages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ bool value;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return -EINVAL;
+
+ ksm_use_zero_pages = value;
+
+ return count;
+}
+KSM_ATTR(use_zero_pages);
+
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2290,6 +2355,7 @@ static struct attribute *ksm_attrs[] = {
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
+ &use_zero_pages_attr.attr,
NULL,
};
@@ -2304,6 +2370,11 @@ static int __init ksm_init(void)
struct task_struct *ksm_thread;
int err;
+ /* The correct value depends on page size and endianness */
+ zero_checksum = calc_checksum(ZERO_PAGE(0));
+ /* Default to false for backwards compatibility */
+ ksm_use_zero_pages = false;
+
err = ksm_slab_init();
if (err)
goto out;
diff --git a/mm/madvise.c b/mm/madvise.c
index 93fb63e88b5e..7a2abf0127ae 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/sched.h>
@@ -20,10 +21,13 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <asm/tlb.h>
+#include "internal.h"
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -89,14 +93,28 @@ static long madvise_behavior(struct vm_area_struct *vma,
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error)
+ if (error) {
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
goto out;
+ }
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
- if (error)
+ if (error) {
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
goto out;
+ }
break;
}
@@ -117,15 +135,37 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
- error = split_vma(mm, vma, start, 1);
- if (error)
+ if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ error = -ENOMEM;
goto out;
+ }
+ error = __split_vma(mm, vma, start, 1);
+ if (error) {
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ goto out;
+ }
}
if (end != vma->vm_end) {
- error = split_vma(mm, vma, end, 0);
- if (error)
+ if (unlikely(mm->map_count >= sysctl_max_map_count)) {
+ error = -ENOMEM;
goto out;
+ }
+ error = __split_vma(mm, vma, end, 0);
+ if (error) {
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ goto out;
+ }
}
success:
@@ -133,10 +173,7 @@ success:
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
-
out:
- if (error == -ENOMEM)
- error = -EAGAIN;
return error;
}
@@ -281,6 +318,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -472,10 +510,47 @@ static long madvise_dontneed(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ if (!can_madv_dontneed_vma(vma))
return -EINVAL;
- zap_page_range(vma, start, end - start, NULL);
+ if (!userfaultfd_remove(vma, start, end)) {
+ *prev = NULL; /* mmap_sem has been dropped, prev is stale */
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, start);
+ if (!vma)
+ return -ENOMEM;
+ if (start < vma->vm_start) {
+ /*
+ * This "vma" under revalidation is the one
+ * with the lowest vma->vm_start where start
+ * is also < vma->vm_end. If start <
+ * vma->vm_start it means an hole materialized
+ * in the user address space within the
+ * virtual range passed to MADV_DONTNEED.
+ */
+ return -ENOMEM;
+ }
+ if (!can_madv_dontneed_vma(vma))
+ return -EINVAL;
+ if (end > vma->vm_end) {
+ /*
+ * Don't fail if end > vma->vm_end. If the old
+ * vma was splitted while the mmap_sem was
+ * released the effect of the concurrent
+ * operation may not cause MADV_DONTNEED to
+ * have an undefined result. There may be an
+ * adjacent next vma that we'll walk
+ * next. userfaultfd_remove() will generate an
+ * UFFD_EVENT_REMOVE repetition on the
+ * end-vma->vm_end range, but the manager can
+ * handle a repetition fine.
+ */
+ end = vma->vm_end;
+ }
+ VM_WARN_ON(start >= end);
+ }
+ zap_page_range(vma, start, end - start);
return 0;
}
@@ -515,7 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma,
* mmap_sem.
*/
get_file(f);
- up_read(&current->mm->mmap_sem);
+ if (userfaultfd_remove(vma, start, end)) {
+ /* mmap_sem was not released by userfaultfd_remove() */
+ up_read(&current->mm->mmap_sem);
+ }
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
diff --git a/mm/memblock.c b/mm/memblock.c
index 7608bc305936..696f06d17c4e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_REGIONS,
+ .memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
+ .reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
.physmem.regions = memblock_physmem_init_regions,
.physmem.cnt = 1, /* empty dummy entry */
.physmem.max = INIT_PHYSMEM_REGIONS,
+ .physmem.name = "physmem",
#endif
.bottom_up = false,
@@ -64,18 +67,6 @@ ulong __init_memblock choose_memblock_flags(void)
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
-/* inline so we don't get a warning when pr_debug is compiled out */
-static __init_memblock const char *
-memblock_type_name(struct memblock_type *type)
-{
- if (type == &memblock.memory)
- return "memory";
- else if (type == &memblock.reserved)
- return "reserved";
- else
- return "unknown";
-}
-
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
@@ -402,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
}
if (!addr) {
pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
- memblock_type_name(type), type->max, type->max * 2);
+ type->name, type->max, type->max * 2);
return -1;
}
memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
- memblock_type_name(type), type->max * 2, (u64)addr,
+ type->name, type->max * 2, (u64)addr,
(u64)addr + new_size - 1);
/*
@@ -611,10 +602,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- 0UL, (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
@@ -718,10 +709,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg(" memblock_free: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
return memblock_remove_range(&memblock.reserved, base, size);
@@ -729,10 +720,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
- (unsigned long long)base,
- (unsigned long long)base + size - 1,
- 0UL, (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
+ &base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
@@ -1105,6 +1096,34 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
*out_nid = r->nid;
}
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
+ unsigned long max_pfn)
+{
+ struct memblock_type *type = &memblock.memory;
+ unsigned int right = type->cnt;
+ unsigned int mid, left = 0;
+ phys_addr_t addr = PFN_PHYS(pfn + 1);
+
+ do {
+ mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else {
+ /* addr is within the region, so pfn + 1 is valid */
+ return min(pfn + 1, max_pfn);
+ }
+ } while (left < right);
+
+ if (right == type->cnt)
+ return max_pfn;
+ else
+ return min(PHYS_PFN(type->regions[right].base), max_pfn);
+}
+
/**
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
@@ -1202,8 +1221,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys
alloc = __memblock_alloc_base(size, align, max_addr);
if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
+ panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
+ &size, &max_addr);
return alloc;
}
@@ -1274,18 +1293,17 @@ static void * __init memblock_virt_alloc_internal(
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
-
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
nid, flags);
- if (alloc)
+ if (alloc && !memblock_reserve(alloc, size))
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
max_addr, NUMA_NO_NODE,
flags);
- if (alloc)
+ if (alloc && !memblock_reserve(alloc, size))
goto done;
}
@@ -1303,7 +1321,6 @@ again:
return NULL;
done:
- memblock_reserve(alloc, size);
ptr = phys_to_virt(alloc);
memset(ptr, 0, size);
@@ -1615,8 +1632,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
if (idx == -1)
return 0;
- return memblock.memory.regions[idx].base <= base &&
- (memblock.memory.regions[idx].base +
+ return (memblock.memory.regions[idx].base +
memblock.memory.regions[idx].size) >= end;
}
@@ -1671,40 +1687,44 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
return memblock.current_limit;
}
-static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
+static void __init_memblock memblock_dump(struct memblock_type *type)
{
- unsigned long long base, size;
+ phys_addr_t base, end, size;
unsigned long flags;
int idx;
struct memblock_region *rgn;
- pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
+ pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);
for_each_memblock_type(type, rgn) {
char nid_buf[32] = "";
base = rgn->base;
size = rgn->size;
+ end = base + size - 1;
flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
- name, idx, base, base + size - 1, size, nid_buf, flags);
+ pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
+ type->name, idx, &base, &end, &size, nid_buf, flags);
}
}
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
- pr_info(" memory size = %#llx reserved size = %#llx\n",
- (unsigned long long)memblock.memory.total_size,
- (unsigned long long)memblock.reserved.total_size);
+ pr_info(" memory size = %pa reserved size = %pa\n",
+ &memblock.memory.total_size,
+ &memblock.reserved.total_size);
- memblock_dump(&memblock.memory, "memory");
- memblock_dump(&memblock.reserved, "reserved");
+ memblock_dump(&memblock.memory);
+ memblock_dump(&memblock.reserved);
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ memblock_dump(&memblock.physmem);
+#endif
}
void __init memblock_allow_resize(void)
@@ -1727,19 +1747,14 @@ static int memblock_debug_show(struct seq_file *m, void *private)
struct memblock_type *type = m->private;
struct memblock_region *reg;
int i;
+ phys_addr_t end;
for (i = 0; i < type->cnt; i++) {
reg = &type->regions[i];
- seq_printf(m, "%4d: ", i);
- if (sizeof(phys_addr_t) == 4)
- seq_printf(m, "0x%08lx..0x%08lx\n",
- (unsigned long)reg->base,
- (unsigned long)(reg->base + reg->size - 1));
- else
- seq_printf(m, "0x%016llx..0x%016llx\n",
- (unsigned long long)reg->base,
- (unsigned long long)(reg->base + reg->size - 1));
+ end = reg->base + reg->size - 1;
+ seq_printf(m, "%4d: ", i);
+ seq_printf(m, "%pa..%pa\n", &reg->base, &end);
}
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f870ba43942..2bd7541d7c11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,8 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
@@ -68,7 +70,7 @@
#include <net/ip.h>
#include "slab.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/vmscan.h>
@@ -317,6 +319,8 @@ void memcg_put_cache_ids(void)
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
+struct workqueue_struct *memcg_kmem_cache_wq;
+
#endif /* !CONFIG_SLOB */
/**
@@ -462,6 +466,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page);
+ if (!mctz)
+ return;
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
@@ -499,7 +505,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(memcg, nid);
mctz = soft_limit_tree_node(nid);
- mem_cgroup_remove_exceeded(mz, mctz);
+ if (mctz)
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
@@ -625,8 +632,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
unsigned long nr = 0;
- struct mem_cgroup_per_node *mz;
enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids);
@@ -634,8 +641,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- mz = mem_cgroup_nodeinfo(memcg, nid);
- nr += mz->lru_size[lru];
+ nr += mem_cgroup_get_lru_size(lruvec, lru);
}
return nr;
}
@@ -1002,6 +1008,7 @@ out:
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
@@ -1009,27 +1016,25 @@ out:
* so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int nr_pages)
+ int zid, int nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;
- bool empty;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- lru_size = mz->lru_size + lru;
- empty = list_empty(lruvec->lists + lru);
+ lru_size = &mz->lru_zone_size[zid][lru];
if (nr_pages < 0)
*lru_size += nr_pages;
size = *lru_size;
- if (WARN_ONCE(size < 0 || empty != !size,
- "%s(%p, %d, %d): lru_size %ld but %sempty\n",
- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
}
@@ -1816,22 +1821,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- if (action == CPU_ONLINE)
- return NOTIFY_OK;
-
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
- return NOTIFY_OK;
+ return 0;
}
static void reclaim_high(struct mem_cgroup *memcg,
@@ -2185,7 +2181,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- schedule_work(&cw->work);
+ queue_work(memcg_kmem_cache_wq, &cw->work);
}
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -2565,7 +2561,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
- if (RB_EMPTY_ROOT(&mctz->rb_root))
+ if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
@@ -2846,6 +2842,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
+ INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -4011,9 +4008,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .seq_start = slab_start,
- .seq_next = slab_next,
- .seq_stop = slab_stop,
+ .seq_start = memcg_slab_start,
+ .seq_next = memcg_slab_next,
+ .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -4141,17 +4138,22 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static void mem_cgroup_free(struct mem_cgroup *memcg)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
kfree(memcg);
}
+static void mem_cgroup_free(struct mem_cgroup *memcg)
+{
+ memcg_wb_domain_exit(memcg);
+ __mem_cgroup_free(memcg);
+}
+
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
@@ -4202,7 +4204,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
if (memcg->id.id > 0)
idr_remove(&mem_cgroup_idr, memcg->id.id);
- mem_cgroup_free(memcg);
+ __mem_cgroup_free(memcg);
return NULL;
}
@@ -4362,9 +4364,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
return ret;
}
- /* Try charges one by one with reclaim */
+ /* Try charges one by one with reclaim, but do not retry */
while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
if (ret)
return ret;
mc.precharge++;
@@ -5774,16 +5776,28 @@ __setup("cgroup.memory=", cgroup_memory);
/*
* subsys_initcall() for memory controller.
*
- * Some parts like hotcpu_notifier() have to be initialized from this context
- * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
- * everything that doesn't depend on a specific mem_cgroup structure should
- * be initialized from here.
+ * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
+ * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
+ * basically everything that doesn't depend on a specific mem_cgroup structure
+ * should be initialized from here.
*/
static int __init mem_cgroup_init(void)
{
int cpu, node;
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+#ifndef CONFIG_SLOB
+ /*
+ * Kmem cache creation is mostly done with the slab_mutex held,
+ * so use a workqueue with limited concurrency to avoid stalling
+ * all worker threads in case lots of cgroups are created and
+ * destroyed simultaneously.
+ */
+ memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
+ BUG_ON(!memcg_kmem_cache_wq);
+#endif
+
+ cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
+ memcg_hotplug_cpu_dead);
for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 19e796d36a62..27f7210e7fab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -40,7 +40,8 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -764,12 +765,11 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/
#define dirty (1UL << PG_dirty)
-#define sc (1UL << PG_swapcache)
+#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
-#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
@@ -819,7 +819,6 @@ static struct page_state {
#undef mlock
#undef writeback
#undef lru
-#undef swapbacked
#undef head
#undef slab
#undef reserved
@@ -1529,7 +1528,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
int ret = __get_any_page(page, pfn, flags);
- if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+ if (ret == 1 && !PageHuge(page) &&
+ !PageLRU(page) && !__PageMovable(page)) {
/*
* Try to free it.
*/
@@ -1651,7 +1651,10 @@ static int __soft_offline_page(struct page *page, int flags)
* Try to migrate to a new page instead. migrate.c
* handles a large number of cases for us.
*/
- ret = isolate_lru_page(page);
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
/*
* Drop page reference which is came from get_any_page()
* successful isolate_lru_page() already took another one.
@@ -1659,18 +1662,20 @@ static int __soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
if (!ret) {
LIST_HEAD(pagelist);
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ /*
+ * After isolated lru page, the PageLRU will be cleared,
+ * so use !__PageMovable instead for LRU page's mapping
+ * cannot have PAGE_MAPPING_MOVABLE.
+ */
+ if (!__PageMovable(page))
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- if (!list_empty(&pagelist)) {
- list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
diff --git a/mm/memory.c b/mm/memory.c
index e18c57bdc75c..235ba51b2fbf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -30,7 +30,7 @@
/*
* 05.04.94 - Multi-page memory management added for v1.1.
- * Idea by Alex Bligh (alex@cconcepts.co.uk)
+ * Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
@@ -40,6 +40,10 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
@@ -68,7 +72,7 @@
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
@@ -82,9 +86,9 @@
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
-struct page *mem_map;
-
EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
@@ -95,8 +99,7 @@ EXPORT_SYMBOL(mem_map);
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
-void * high_memory;
-
+void *high_memory;
EXPORT_SYMBOL(high_memory);
/*
@@ -120,10 +123,10 @@ static int __init disable_randmaps(char *s)
__setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
-
EXPORT_SYMBOL(zero_pfn);
+unsigned long highest_memmap_pfn __read_mostly;
+
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
@@ -300,15 +303,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
-
- if (!tlb->page_size)
- tlb->page_size = page_size;
- else {
- if (page_size != tlb->page_size)
- return true;
- }
+ VM_WARN_ON(tlb->page_size != page_size);
batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
@@ -316,7 +318,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
}
VM_BUG_ON_PAGE(batch->nr > batch->max, page);
- batch->pages[batch->nr++] = page;
return false;
}
@@ -444,7 +445,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
mm_dec_nr_pmds(tlb->mm);
}
-static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -453,7 +454,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long start;
start = addr;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
@@ -461,6 +462,39 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
free_pmd_range(tlb, pud, addr, next, floor, ceiling);
} while (pud++, addr = next, addr != end);
+ start &= P4D_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= P4D_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pud = pud_offset(p4d, start);
+ p4d_clear(p4d);
+ pud_free_tlb(tlb, pud, start);
+}
+
+static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ unsigned long start;
+
+ start = addr;
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ free_pud_range(tlb, p4d, addr, next, floor, ceiling);
+ } while (p4d++, addr = next, addr != end);
+
start &= PGDIR_MASK;
if (start < floor)
return;
@@ -472,9 +506,9 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
if (end - 1 > ceiling - 1)
return;
- pud = pud_offset(pgd, start);
+ p4d = p4d_offset(pgd, start);
pgd_clear(pgd);
- pud_free_tlb(tlb, pud, start);
+ p4d_free_tlb(tlb, p4d, start);
}
/*
@@ -528,13 +562,17 @@ void free_pgd_range(struct mmu_gather *tlb,
end -= PMD_SIZE;
if (addr > end - 1)
return;
-
+ /*
+ * We add page table cache pages with PAGE_SIZE,
+ * (see pte_free_tlb()), flush the tlb if we need
+ */
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+ free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
}
@@ -554,7 +592,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
+ floor, next ? next->vm_start : ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
@@ -567,7 +605,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
- floor, next? next->vm_start: ceiling);
+ floor, next ? next->vm_start : ceiling);
}
vma = next;
}
@@ -653,7 +691,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, struct page *page)
{
pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
- pud_t *pud = pud_offset(pgd, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
struct address_space *mapping;
pgoff_t index;
@@ -999,7 +1038,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
int err;
- VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
err = copy_huge_pmd(dst_mm, src_mm,
dst_pmd, src_pmd, addr, vma);
if (err == -ENOMEM)
@@ -1018,18 +1057,30 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
}
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pud_t *src_pud, *dst_pud;
unsigned long next;
- dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
+ dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
if (!dst_pud)
return -ENOMEM;
- src_pud = pud_offset(src_pgd, addr);
+ src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ int err;
+
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+ err = copy_huge_pud(dst_mm, src_mm,
+ dst_pud, src_pud, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
@@ -1039,6 +1090,28 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
return 0;
}
+static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ p4d_t *src_p4d, *dst_p4d;
+ unsigned long next;
+
+ dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
+ if (!dst_p4d)
+ return -ENOMEM;
+ src_p4d = p4d_offset(src_pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(src_p4d))
+ continue;
+ if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
+ vma, addr, next))
+ return -ENOMEM;
+ } while (dst_p4d++, src_p4d++, addr = next, addr != end);
+ return 0;
+}
+
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *vma)
{
@@ -1094,7 +1167,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+ if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
vma, addr, next))) {
ret = -ENOMEM;
break;
@@ -1118,8 +1191,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
- struct page *pending_page = NULL;
+ tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1127,9 +1200,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
- if (pte_none(ptent)) {
+ if (pte_none(ptent))
continue;
- }
if (pte_present(ptent)) {
struct page *page;
@@ -1153,12 +1225,6 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
- /*
- * oom_reaper cannot tear down dirty
- * pages
- */
- if (unlikely(details && details->ignore_dirty))
- continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1172,14 +1238,13 @@ again:
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
- pending_page = page;
addr += PAGE_SIZE;
break;
}
continue;
}
- /* only check swap_entries if explicitly asked for in details */
- if (unlikely(details && !details->check_swap_entries))
+ /* If details->check_mapping, we leave swap entries. */
+ if (unlikely(details))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1213,11 +1278,6 @@ again:
if (force_flush) {
force_flush = 0;
tlb_flush_mmu_free(tlb);
- if (pending_page) {
- /* remove the page with new size */
- __tlb_remove_pte_page(tlb, pending_page);
- pending_page = NULL;
- }
if (addr != end)
goto again;
}
@@ -1240,7 +1300,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
- split_huge_pmd(vma, pmd, addr);
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1263,24 +1323,53 @@ next:
}
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma, pgd_t *pgd,
+ struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pud_t *pud;
unsigned long next;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (next - addr != HPAGE_PUD_SIZE) {
+ VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ split_huge_pud(vma, pud, addr);
+ } else if (zap_huge_pud(tlb, vma, pud, addr))
+ goto next;
+ /* fall through */
+ }
if (pud_none_or_clear_bad(pud))
continue;
next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+next:
+ cond_resched();
} while (pud++, addr = next, addr != end);
return addr;
}
+static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ next = zap_pud_range(tlb, vma, p4d, addr, next, details);
+ } while (p4d++, addr = next, addr != end);
+
+ return addr;
+}
+
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
@@ -1296,7 +1385,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+ next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
@@ -1380,12 +1469,11 @@ void unmap_vmas(struct mmu_gather *tlb,
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of shared cache invalidation
*
* Caller must protect the VMA list
*/
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size, struct zap_details *details)
+ unsigned long size)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
@@ -1396,7 +1484,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, end, details);
+ unmap_single_vma(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1452,16 +1540,24 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
- pgd_t * pgd = pgd_offset(mm, addr);
- pud_t * pud = pud_alloc(mm, pgd, addr);
- if (pud) {
- pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd) {
- VM_BUG_ON(pmd_trans_huge(*pmd));
- return pte_alloc_map_lock(mm, pmd, addr, ptl);
- }
- }
- return NULL;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset(mm, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return NULL;
+
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
/*
@@ -1637,8 +1733,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
- return -EINVAL;
+
+ track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
@@ -1655,8 +1751,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, pfn))
- return -EINVAL;
+
+ track_pfn_insert(vma, &pgprot, pfn);
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1727,7 +1823,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
return 0;
}
-static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
@@ -1735,7 +1831,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long next;
pfn -= addr >> PAGE_SHIFT;
- pud = pud_alloc(mm, pgd, addr);
+ pud = pud_alloc(mm, p4d, addr);
if (!pud)
return -ENOMEM;
do {
@@ -1747,6 +1843,26 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
+static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ pfn -= addr >> PAGE_SHIFT;
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ if (remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot))
+ return -ENOMEM;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -1803,7 +1919,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
- err = remap_pud_range(mm, pgd, addr, next,
+ err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
break;
@@ -1919,7 +2035,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
return err;
}
-static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
{
@@ -1927,7 +2043,7 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long next;
int err;
- pud = pud_alloc(mm, pgd, addr);
+ pud = pud_alloc(mm, p4d, addr);
if (!pud)
return -ENOMEM;
do {
@@ -1939,6 +2055,26 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
return err;
}
+static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int err;
+
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+ if (err)
+ break;
+ } while (p4d++, addr = next, addr != end);
+ return err;
+}
+
/*
* Scan a region of virtual memory, filling in page tables as necessary
* and calling a provided function on each leaf page table.
@@ -1957,7 +2093,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
if (err)
break;
} while (pgd++, addr = next, addr != end);
@@ -2038,20 +2174,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2067,6 +2200,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2074,11 +2242,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2088,39 +2256,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2139,31 +2280,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2176,8 +2318,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2187,7 +2329,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2196,8 +2338,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2205,8 +2347,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2243,7 +2385,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2267,79 +2409,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns
+ * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2360,14 +2514,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2377,33 +2530,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2412,24 +2565,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2496,7 +2650,7 @@ void unmap_mapping_range(struct address_space *mapping,
hlen = ULONG_MAX - hba + 1;
}
- details.check_mapping = even_cows? NULL: mapping;
+ details.check_mapping = even_cows ? NULL : mapping;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
@@ -2517,9 +2671,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2528,17 +2682,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2546,16 +2701,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2577,7 +2732,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2594,7 +2749,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2610,9 +2765,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2633,22 +2788,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2671,22 +2827,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2737,9 +2893,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2749,7 +2905,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2762,26 +2918,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2789,7 +2945,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2807,30 +2963,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2847,62 +3003,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
+ ret = vma->vm_ops->fault(vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = NULL;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2917,11 +3061,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2939,11 +3083,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ /*
+ * We are going to consume the prealloc table,
+ * count that as nr_ptes.
+ */
+ atomic_long_inc(&vma->vm_mm->nr_ptes);
+ vmf->prealloc_pte = NULL;
+}
+
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2953,8 +3110,19 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
ret = VM_FAULT_FALLBACK;
page = compound_head(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ /*
+ * Archs like ppc64 need additonal space to store information
+ * related to pte entry. Use the preallocated table for that.
+ */
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2966,20 +3134,25 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
page_add_file_rmap(page, true);
+ /*
+ * deposit and withdraw with pmd lock held
+ */
+ if (arch_needs_pgtable_deposit())
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -2990,41 +3163,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
return ret;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte)))
+ if (unlikely(!pte_none(*vmf->pte)))
return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
@@ -3034,21 +3208,53 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
return 0;
}
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+}
+
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
@@ -3113,17 +3319,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3131,50 +3338,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
- /* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
- }
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3183,80 +3385,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3265,46 +3454,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3314,19 +3481,27 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
+ int ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
- return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ ret = VM_FAULT_SIGBUS;
+ else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ ret = do_read_fault(vmf);
+ else if (!(vma->vm_flags & VM_SHARED))
+ ret = do_cow_fault(vmf);
+ else
+ ret = do_shared_fault(vmf);
+
+ /* preallocated pagetable is unused: free it */
+ if (vmf->prealloc_pte) {
+ pte_free(vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = NULL;
+ }
+ return ret;
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3344,50 +3519,51 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
- bool was_writable = pte_write(pte);
+ pte_t pte;
+ bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
- * The "pte" at this point cannot be used safely without
- * validation through pte_unmap_same(). It's of NUMA type but
- * the pfn may be screwed if the read is non atomic.
- *
- * We can safely just do a "set_pte_at()", because the old
- * page table entry is not accessible, so there would be no
- * concurrent hardware modifications to the PTE.
- */
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ */
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
- /* Make it present again */
+ /*
+ * Make it present again, Depending on how arch implementes non
+ * accessible ptes, some can allow access by kernel mode.
+ */
+ pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
pte = pte_modify(pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3411,9 +3587,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3433,28 +3609,25 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
- if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_anonymous_page(vmf);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- split_huge_pmd(fe->vma, fe->pmd, fe->address);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3464,6 +3637,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}
+static int create_huge_pud(struct vm_fault *vmf)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3479,21 +3676,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3501,9 +3698,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3514,38 +3710,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3553,11 +3750,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3570,48 +3767,78 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
- pud_t *pud;
+ p4d_t *p4d;
+ int ret;
pgd = pgd_offset(mm, address);
- pud = pud_alloc(mm, pgd, address);
- if (!pud)
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+
+ vmf.pud = pud_alloc(mm, p4d, address);
+ if (!vmf.pud)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
- int ret;
+ pud_t orig_pud = *vmf.pud;
+
+ barrier();
+ if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ /* NUMA case for anonymous PUDs would go here */
+
+ if (dirty && !pud_write(orig_pud)) {
+ ret = wp_huge_pud(&vmf, orig_pud);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ huge_pud_set_accessed(&vmf, orig_pud);
+ return 0;
+ }
+ }
+ }
+
+ vmf.pmd = pmd_alloc(mm, vmf.pud, address);
+ if (!vmf.pmd)
+ return VM_FAULT_OOM;
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ pmd_t orig_pmd = *vmf.pmd;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
/*
@@ -3652,14 +3879,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
- /*
- * The task may have entered a memcg OOM situation but
- * if the allocation error was handled gracefully (no
- * VM_FAULT_OOM), there is no need to kill anything.
- * Just clean up the OOM state peacefully.
- */
- if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
- mem_cgroup_oom_synchronize(false);
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
}
/*
@@ -3679,12 +3906,35 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
+#ifndef __PAGETABLE_P4D_FOLDED
+/*
+ * Allocate p4d page table.
+ * We've already handled the fast-path in-line.
+ */
+int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+ p4d_t *new = p4d_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&mm->page_table_lock);
+ if (pgd_present(*pgd)) /* Another has populated it */
+ p4d_free(mm, new);
+ else
+ pgd_populate(mm, pgd, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+}
+#endif /* __PAGETABLE_P4D_FOLDED */
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
*/
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
pud_t *new = pud_alloc_one(mm, address);
if (!new)
@@ -3693,10 +3943,17 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
- if (pgd_present(*pgd)) /* Another has populated it */
+#ifndef __ARCH_HAS_5LEVEL_HACK
+ if (p4d_present(*p4d)) /* Another has populated it */
pud_free(mm, new);
else
- pgd_populate(mm, pgd, new);
+ p4d_populate(mm, p4d, new);
+#else
+ if (pgd_present(*p4d)) /* Another has populated it */
+ pud_free(mm, new);
+ else
+ pgd_populate(mm, p4d, new);
+#endif /* __ARCH_HAS_5LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -3709,13 +3966,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
*/
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
+ spinlock_t *ptl;
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
- spin_lock(&mm->page_table_lock);
+ ptl = pud_lock(mm, pud);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
@@ -3729,15 +3987,16 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
} else /* Another has populated it */
pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
@@ -3746,17 +4005,30 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+ goto out;
+
+ pud = pud_offset(p4d, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
- /* We cannot handle huge page PFN maps. Luckily they don't exist. */
- if (pmd_huge(*pmd))
+ if (pmd_huge(*pmd)) {
+ if (!pmdpp)
+ goto out;
+
+ *ptlp = pmd_lock(mm, pmd);
+ if (pmd_huge(*pmd)) {
+ *pmdpp = pmd;
+ return 0;
+ }
+ spin_unlock(*ptlp);
+ }
+
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -3779,10 +4051,24 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+ ptlp)));
return res;
}
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+ ptlp)));
+ return res;
+}
+EXPORT_SYMBOL(follow_pte_pmd);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -3868,7 +4154,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -3883,7 +4169,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
@@ -3966,6 +4252,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
return ret;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/*
* Print the name of a VMA.
@@ -4093,6 +4380,38 @@ void copy_user_huge_page(struct page *dst, struct page *src,
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
+
+long copy_huge_page_from_user(struct page *dst_page,
+ const void __user *usr_src,
+ unsigned int pages_per_huge_page,
+ bool allow_pagefault)
+{
+ void *src = (void *)usr_src;
+ void *page_kaddr;
+ unsigned long i, rc = 0;
+ unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+
+ for (i = 0; i < pages_per_huge_page; i++) {
+ if (allow_pagefault)
+ page_kaddr = kmap(dst_page + i);
+ else
+ page_kaddr = kmap_atomic(dst_page + i);
+ rc = copy_from_user(page_kaddr,
+ (const void __user *)(src + i * PAGE_SIZE),
+ PAGE_SIZE);
+ if (allow_pagefault)
+ kunmap(dst_page + i);
+ else
+ kunmap_atomic(page_kaddr);
+
+ ret_val -= (PAGE_SIZE - rc);
+ if (rc)
+ break;
+
+ cond_resched();
+ }
+ return ret_val;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cad4b9125695..6fa7208bcd56 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -6,6 +6,7 @@
#include <linux/stddef.h>
#include <linux/mm.h>
+#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
@@ -124,8 +125,13 @@ void put_online_mems(void)
}
+/* Serializes write accesses to mem_hotplug.active_writer. */
+static DEFINE_MUTEX(memory_add_remove_lock);
+
void mem_hotplug_begin(void)
{
+ mutex_lock(&memory_add_remove_lock);
+
mem_hotplug.active_writer = current;
memhp_lock_acquire();
@@ -144,6 +150,7 @@ void mem_hotplug_done(void)
mem_hotplug.active_writer = NULL;
mutex_unlock(&mem_hotplug.lock);
memhp_lock_release();
+ mutex_unlock(&memory_add_remove_lock);
}
/* add this memory to iomem resource */
@@ -179,7 +186,7 @@ static void release_memory_resource(struct resource *res)
void get_page_bootmem(unsigned long info, struct page *page,
unsigned long type)
{
- page->lru.next = (struct list_head *) type;
+ page->freelist = (void *)type;
SetPagePrivate(page);
set_page_private(page, info);
page_ref_inc(page);
@@ -189,11 +196,12 @@ void put_page_bootmem(struct page *page)
{
unsigned long type;
- type = (unsigned long) page->lru.next;
+ type = (unsigned long) page->freelist;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -861,7 +869,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
return ret;
}
-EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
@@ -1033,36 +1040,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target)
+bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target, int *zone_shift)
{
struct zone *zone = page_zone(pfn_to_page(pfn));
enum zone_type idx = zone_idx(zone);
int i;
+ *zone_shift = 0;
+
if (idx < target) {
/* pages must be at end of current zone */
if (pfn + nr_pages != zone_end_pfn(zone))
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = idx + 1; i < target; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
if (target < idx) {
/* pages must be at beginning of current zone */
if (pfn != zone->zone_start_pfn)
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = target + 1; i < idx; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
- return target - idx;
+ *zone_shift = target - idx;
+ return true;
}
/* Must be protected by mem_hotplug_begin() */
@@ -1089,10 +1099,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
!can_online_high_movable(zone))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
- else if (online_type == MMOP_ONLINE_MOVABLE)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
+ return -EINVAL;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
+ return -EINVAL;
+ }
zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
if (!zone)
@@ -1329,7 +1342,7 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
static int online_memory_block(struct memory_block *mem, void *arg)
{
- return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+ return device_online(&mem->dev);
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
@@ -1477,17 +1490,20 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
+ * When true, return its valid [start, end).
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long *valid_start, unsigned long *valid_end)
{
unsigned long pfn, sec_end_pfn;
+ unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
- for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn);
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
- pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) {
+ pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
/* Make sure the memory section is present first */
if (!present_section_nr(pfn_to_section_nr(pfn)))
continue;
@@ -1498,22 +1514,32 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
while ((i < MAX_ORDER_NR_PAGES) &&
!pfn_valid_within(pfn + i))
i++;
- if (i == MAX_ORDER_NR_PAGES)
+ if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
+ if (!zone)
+ start = pfn + i;
zone = page_zone(page);
+ end = pfn + MAX_ORDER_NR_PAGES;
}
}
- return 1;
+
+ if (zone) {
+ *valid_start = start;
+ *valid_end = min(end, end_pfn);
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
- * and hugepages). We scan pfn because it's much easier than scanning over
- * linked list. This function returns the pfn of the first found movable
- * page if it's found, otherwise 0.
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
+ * non-lru movable pages and hugepages). We scan pfn because it's much
+ * easier than scanning over linked list. This function returns the pfn
+ * of the first found movable page if it's found, otherwise 0.
*/
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
@@ -1524,6 +1550,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
page = pfn_to_page(pfn);
if (PageLRU(page))
return pfn;
+ if (__PageMovable(page))
+ return pfn;
if (PageHuge(page)) {
if (page_huge_active(page))
return pfn;
@@ -1600,21 +1628,25 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!get_page_unless_zero(page))
continue;
/*
- * We can skip free pages. And we can only deal with pages on
- * LRU.
+ * We can skip free pages. And we can deal with pages on
+ * LRU and non-lru movable pages.
*/
- ret = isolate_lru_page(page);
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
put_page(page);
list_add_tail(&page->lru, &source);
move_pages--;
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ if (!__PageMovable(page))
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
} else {
#ifdef CONFIG_DEBUG_VM
- pr_alert("removing pfn %lx from LRU failed\n", pfn);
- dump_page(page, "failed to remove from LRU");
+ pr_alert("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
@@ -1727,26 +1759,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- memblock_set_bottom_up(true);
movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");
@@ -1853,6 +1865,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
long offlined_pages;
int ret, drain, retry_max, node;
unsigned long flags;
+ unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
@@ -1863,10 +1876,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
return -EINVAL;
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
return -EINVAL;
- zone = page_zone(pfn_to_page(start_pfn));
+ zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0b859af06b87..37d0b334bfe9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,6 +73,9 @@
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
@@ -96,7 +99,7 @@
#include <linux/printk.h>
#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -276,7 +279,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return ERR_PTR(-EINVAL);
}
} else if (mode == MPOL_LOCAL) {
- if (!nodes_empty(*nodes))
+ if (!nodes_empty(*nodes) ||
+ (flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
@@ -496,7 +501,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmd, addr);
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
get_page(page);
spin_unlock(ptl);
@@ -1524,7 +1529,6 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
@@ -1533,14 +1537,13 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
+ if (compat_get_bitmap(bm, nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
+ if (copy_to_user(nm, bm, alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
@@ -1548,7 +1551,6 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
compat_ulong_t, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode, compat_ulong_t, flags)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
nodemask_t bm;
@@ -1557,14 +1559,13 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+ if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+ if (copy_to_user(nm, nodes_addr(bm), alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
@@ -1679,25 +1680,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
int nd)
{
- switch (policy->mode) {
- case MPOL_PREFERRED:
- if (!(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- break;
- case MPOL_BIND:
+ if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
+ else {
/*
- * Normally, MPOL_BIND allocations are node-local within the
- * allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node isn't part of the mask, we use the zonelist for
- * the first node in the mask instead.
+ * __GFP_THISNODE shouldn't even be used with the bind policy
+ * because we might easily break the expectation to stay on the
+ * requested node and not break the policy.
*/
- if (unlikely(gfp & __GFP_THISNODE) &&
- unlikely(!node_isset(nd, policy->v.nodes)))
- nd = first_node(policy->v.nodes);
- break;
- default:
- BUG();
+ WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
+
return node_zonelist(nd, gfp);
}
@@ -2023,8 +2016,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
zl = policy_zonelist(gfp, pol, node);
- mpol_cond_put(pol);
page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ mpol_cond_put(pol);
out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
diff --git a/mm/migrate.c b/mm/migrate.c
index 99250aee1ac1..738f1d5f8350 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -40,6 +40,7 @@
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
+#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
@@ -74,7 +75,7 @@ int migrate_prep_local(void)
return 0;
}
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
struct address_space *mapping;
@@ -125,14 +126,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
__SetPageIsolated(page);
unlock_page(page);
- return true;
+ return 0;
out_no_isolated:
unlock_page(page);
out_putpage:
put_page(page);
out:
- return false;
+ return -EBUSY;
}
/* It should be called on page which is PG_movable */
@@ -168,8 +169,6 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
/*
* We isolated non-lru movable page so here we can use
* __PageMovable because LRU page's mapping cannot have
@@ -185,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
+ dec_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
putback_lru_page(page);
}
}
@@ -193,82 +194,65 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *old)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = old,
+ .vma = vma,
+ .address = addr,
+ .flags = PVMW_SYNC | PVMW_MIGRATION,
+ };
+ struct page *new;
+ pte_t pte;
swp_entry_t entry;
- pmd_t *pmd;
- pte_t *ptep, pte;
- spinlock_t *ptl;
- if (unlikely(PageHuge(new))) {
- ptep = huge_pte_offset(mm, addr);
- if (!ptep)
- goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
- } else {
- pmd = mm_find_pmd(mm, addr);
- if (!pmd)
- goto out;
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ while (page_vma_mapped_walk(&pvmw)) {
+ if (PageKsm(page))
+ new = page;
+ else
+ new = page - pvmw.page->index +
+ linear_page_index(vma, pvmw.address);
- ptep = pte_offset_map(pmd, addr);
+ get_page(new);
+ pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_mksoft_dirty(pte);
/*
- * Peek to check is_swap_pte() before taking ptlock? No, we
- * can race mremap's move_ptes(), which skips anon_vma lock.
+ * Recheck VMA as permissions can change since migration started
*/
-
- ptl = pte_lockptr(mm, pmd);
- }
-
- spin_lock(ptl);
- pte = *ptep;
- if (!is_swap_pte(pte))
- goto unlock;
-
- entry = pte_to_swp_entry(pte);
-
- if (!is_migration_entry(entry) ||
- migration_entry_to_page(entry) != old)
- goto unlock;
-
- get_page(new);
- pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
- if (pte_swp_soft_dirty(*ptep))
- pte = pte_mksoft_dirty(pte);
-
- /* Recheck VMA as permissions can change since migration started */
- if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
+ entry = pte_to_swp_entry(*pvmw.pte);
+ if (is_write_migration_entry(entry))
+ pte = maybe_mkwrite(pte, vma);
#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(new)) {
- pte = pte_mkhuge(pte);
- pte = arch_make_huge_pte(pte, vma, new, 0);
- }
+ if (PageHuge(new)) {
+ pte = pte_mkhuge(pte);
+ pte = arch_make_huge_pte(pte, vma, new, 0);
+ }
#endif
- flush_dcache_page(new);
- set_pte_at(mm, addr, ptep, pte);
+ flush_dcache_page(new);
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
- if (PageHuge(new)) {
- if (PageAnon(new))
- hugepage_add_anon_rmap(new, vma, addr);
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, pvmw.address);
+ else
+ page_dup_rmap(new, true);
+ } else if (PageAnon(new))
+ page_add_anon_rmap(new, vma, pvmw.address, false);
else
- page_dup_rmap(new, true);
- } else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr, false);
- else
- page_add_file_rmap(new, false);
+ page_add_file_rmap(new, false);
- if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
- mlock_vma_page(new);
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
+ mlock_vma_page(new);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, pvmw.address, pvmw.pte);
+ }
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, ptep);
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
return SWAP_AGAIN;
}
@@ -466,13 +450,15 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- __SetPageSwapBacked(newpage);
-
get_page(newpage); /* add cache reference */
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ if (PageSwapBacked(page)) {
+ __SetPageSwapBacked(newpage);
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+ } else {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
}
/* Move dirty while page refs frozen and newpage not yet exposed */
@@ -482,7 +468,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
/*
* Drop cache reference from old page by unfreezing
@@ -556,7 +542,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
@@ -1121,8 +1107,15 @@ out:
* restored.
*/
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+
+ /*
+ * Compaction can migrate also non-LRU pages which are
+ * not accounted to NR_ISOLATED_*. They can be recognized
+ * as __PageMovable
+ */
+ if (likely(!__PageMovable(page)))
+ dec_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
}
/*
diff --git a/mm/mincore.c b/mm/mincore.c
index bfb866435478..c5687c45c326 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,9 +14,10 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
diff --git a/mm/mlock.c b/mm/mlock.c
index cdbed8aaa426..0dd9ca18e19e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,6 +8,7 @@
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
@@ -379,6 +380,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
pte = get_locked_pte(vma->vm_mm, start, &ptl);
/* Make sure we do not cross the page table boundary */
end = pgd_addr_end(start, end);
+ end = p4d_addr_end(start, end);
end = pud_addr_end(start, end);
end = pmd_addr_end(start, end);
@@ -441,7 +443,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
while (start < end) {
struct page *page;
- unsigned int page_mask;
+ unsigned int page_mask = 0;
unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
@@ -455,8 +457,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* suits munlock very well (and if somehow an abnormal page
* has sneaked into the range, we won't oops here: great).
*/
- page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
- &page_mask);
+ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
if (PageTransTail(page)) {
@@ -467,8 +468,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
/*
* Any THP page found by follow_page_mask() may
* have gotten split before reaching
- * munlock_vma_page(), so we need to recompute
- * the page_mask here.
+ * munlock_vma_page(), so we need to compute
+ * the page_mask here instead.
*/
page_mask = munlock_vma_page(page);
unlock_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 1af87c14183d..bfbe8856d134 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,7 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
@@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}
-static int do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
@@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
struct mm_struct *mm = current->mm;
unsigned long min_brk;
bool populate;
+ LIST_HEAD(uf);
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* Always allow shrinking brk. */
if (brk <= mm->brk) {
- if (!do_munmap(mm, newbrk, oldbrk-newbrk))
+ if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
goto set_brk;
goto out;
}
@@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;
/* Ok, looks good - let it rip. */
- if (do_brk(oldbrk, newbrk-oldbrk) < 0)
+ if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0)
goto out;
set_brk:
mm->brk = brk;
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
up_write(&mm->mmap_sem);
+ userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
@@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm,
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, vm_flags_t vm_flags,
- unsigned long pgoff, unsigned long *populate)
+ unsigned long pgoff, unsigned long *populate,
+ struct list_head *uf)
{
struct mm_struct *mm = current->mm;
int pkey = 0;
@@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_NORESERVE;
}
- addr = mmap_region(file, addr, len, vm_flags, pgoff);
+ addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
@@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
}
unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Clear old maps */
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
- if (do_munmap(mm, addr, len))
+ if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
@@ -1668,7 +1672,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file);
- error = file->f_op->mmap(file, vma);
+ error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -2495,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
- * munmap path where it doesn't make sense to fail.
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
+ * has already been checked or doesn't make sense to fail.
*/
-static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vm_area_struct *new;
int err;
@@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
@@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (vma->vm_start >= end)
return 0;
+ if (uf) {
+ int error = userfaultfd_unmap_prep(vma, start, end, uf);
+
+ if (error)
+ return error;
+ }
+
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2668,27 +2680,22 @@ int vm_munmap(unsigned long start, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
+ LIST_HEAD(uf);
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_munmap(mm, start, len);
+ ret = do_munmap(mm, start, len, &uf);
up_write(&mm->mmap_sem);
+ userfaultfd_unmap_complete(mm, &uf);
return ret;
}
EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
- int ret;
- struct mm_struct *mm = current->mm;
-
profile_munmap(addr);
- if (down_write_killable(&mm->mmap_sem))
- return -EINTR;
- ret = do_munmap(mm, addr, len);
- up_write(&mm->mmap_sem);
- return ret;
+ return vm_munmap(addr, len);
}
@@ -2780,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
file = get_file(vma->vm_file);
ret = do_mmap_pgoff(vma->vm_file, start, size,
- prot, flags, pgoff, &populate);
+ prot, flags, pgoff, &populate, NULL);
fput(file);
out:
up_write(&mm->mmap_sem);
@@ -2806,11 +2813,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-static int do_brk(unsigned long addr, unsigned long request)
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- unsigned long flags, len;
+ unsigned long len;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2821,7 +2828,10 @@ static int do_brk(unsigned long addr, unsigned long request)
if (!len)
return 0;
- flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
if (offset_in_page(error))
@@ -2842,7 +2852,7 @@ static int do_brk(unsigned long addr, unsigned long request)
*/
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
- if (do_munmap(mm, addr, len))
+ if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
@@ -2889,22 +2899,35 @@ out:
return 0;
}
-int vm_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf)
+{
+ return do_brk_flags(addr, len, 0, uf);
+}
+
+int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
int ret;
bool populate;
+ LIST_HEAD(uf);
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_brk(addr, len);
+ ret = do_brk_flags(addr, len, flags, &uf);
populate = ((mm->def_flags & VM_LOCKED) != 0);
up_write(&mm->mmap_sem);
+ userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
}
+EXPORT_SYMBOL(vm_brk_flags);
+
+int vm_brk(unsigned long addr, unsigned long len)
+{
+ return vm_brk_flags(addr, len, 0);
+}
EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
@@ -3111,8 +3134,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->data_vm += npages;
}
-static int special_mapping_fault(struct vm_area_struct *vma,
- struct vm_fault *vmf);
+static int special_mapping_fault(struct vm_fault *vmf);
/*
* Having a close hook prevents vma merging regardless of flags.
@@ -3147,9 +3169,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = {
.fault = special_mapping_fault,
};
-static int special_mapping_fault(struct vm_area_struct *vma,
- struct vm_fault *vmf)
+static int special_mapping_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
pgoff_t pgoff;
struct page **pages;
@@ -3159,7 +3181,7 @@ static int special_mapping_fault(struct vm_area_struct *vma,
struct vm_special_mapping *sm = vma->vm_private_data;
if (sm->fault)
- return sm->fault(sm, vma, vmf);
+ return sm->fault(sm, vmf->vma, vmf);
pages = sm->pages;
}
@@ -3433,7 +3455,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
}
/*
- * initialise the VMA slab
+ * initialise the percpu counter for VM
*/
void __init mmap_init(void)
{
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 6f4d27c5bb32..3e612ae748e9 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,8 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/mmu_context.h>
#include <linux/export.h>
@@ -25,7 +27,7 @@ void use_mm(struct mm_struct *mm)
task_lock(tsk);
active_mm = tsk->active_mm;
if (active_mm != mm) {
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index f4259e496f83..a7652acd2ab9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -17,6 +17,7 @@
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
/* global SRCU for all MMs */
@@ -275,7 +276,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
mm->mmu_notifier_mm = mmu_notifier_mm;
mmu_notifier_mm = NULL;
}
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
/*
* Serialize the update against mmu_notifier_unregister. A
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 5652be858e5e..a51c0a67ea3d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
- if (likely(nodes == NULL))
+ if (unlikely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 11936526b08b..8edd0d576254 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -25,7 +25,7 @@
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
@@ -33,34 +33,6 @@
#include "internal.h"
-/*
- * For a prot_numa update we only hold mmap_sem for read so there is a
- * potential race with faulting where a pmd was temporarily none. This
- * function checks for a transhuge pmd under the appropriate lock. It
- * returns a pte if it was successfully locked or NULL if it raced with
- * a transhuge insertion.
- */
-static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, int prot_numa, spinlock_t **ptl)
-{
- pte_t *pte;
- spinlock_t *pmdl;
-
- /* !prot_numa is protected by mmap_sem held for write */
- if (!prot_numa)
- return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
-
- pmdl = pmd_lock(vma->vm_mm, pmd);
- if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
- spin_unlock(pmdl);
- return NULL;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
- spin_unlock(pmdl);
- return pte;
-}
-
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -69,11 +41,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
+ int target_node = NUMA_NO_NODE;
- pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ /*
+ * Can be called with only the mmap_sem for reading by
+ * prot_numa so we must check the pmd isn't constantly
+ * changing from under us from pmd_none to pmd_trans_huge
+ * and/or the other way around.
+ */
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ /*
+ * The pmd points to a regular pte so the pmd can't change
+ * from under us even if the mmap_sem is only hold for
+ * reading.
+ */
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (!pte)
return 0;
+ /* Get target node for single threaded private VMAs */
+ if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+ atomic_read(&vma->vm_mm->mm_users) == 1)
+ target_node = numa_node_id();
+
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -95,12 +87,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
+
+ /*
+ * Don't mess with PTEs if page is already on the node
+ * a single-threaded process is running on.
+ */
+ if (target_node == page_to_nid(page))
+ continue;
}
ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
if (preserve_write)
- ptent = pte_mkwrite(ptent);
+ ptent = pte_mk_savedwrite(ptent);
/* Avoid taking write faults for known dirty pages */
if (dirty_accountable && pte_dirty(ptent) &&
@@ -163,9 +162,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
- split_huge_pmd(vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- continue;
+ __split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
@@ -196,14 +193,14 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
- pgd_t *pgd, unsigned long addr, unsigned long end,
+ p4d_t *p4d, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
unsigned long pages = 0;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
@@ -215,6 +212,26 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
return pages;
}
+static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ unsigned long pages = 0;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ pages += change_pud_range(vma, p4d, addr, next, newprot,
+ dirty_accountable, prot_numa);
+ } while (p4d++, addr = next, addr != end);
+
+ return pages;
+}
+
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -233,7 +250,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- pages += change_pud_range(vma, pgd, addr, next, newprot,
+ pages += change_p4d_range(vma, pgd, addr, next, newprot,
dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
@@ -484,6 +501,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return do_mprotect_pkey(start, len, prot, -1);
}
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
unsigned long, prot, int, pkey)
{
@@ -534,3 +553,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
*/
return ret;
}
+
+#endif /* CONFIG_ARCH_HAS_PKEYS */
diff --git a/mm/mremap.c b/mm/mremap.c
index 30d7d2482eea..cd8a1b199ef9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
+#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -31,6 +32,7 @@
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -38,7 +40,11 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
if (pgd_none_or_clear_bad(pgd))
return NULL;
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none_or_clear_bad(p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, addr);
if (pud_none_or_clear_bad(pud))
return NULL;
@@ -53,11 +59,15 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, addr);
if (!pud)
return NULL;
@@ -250,7 +260,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr,
+ bool *locked, struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -309,6 +321,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_addr = new_addr;
new_addr = err;
} else {
+ mremap_userfaultfd_prep(new_vma, uf);
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
}
@@ -338,7 +351,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);
- if (do_munmap(mm, old_addr, old_len) < 0) {
+ if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
@@ -413,7 +426,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+ unsigned long new_addr, unsigned long new_len, bool *locked,
+ struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -431,12 +446,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
- ret = do_munmap(mm, new_addr, new_len);
+ ret = do_munmap(mm, new_addr, new_len, NULL);
if (ret)
goto out;
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
if (ret && old_len != new_len)
goto out;
old_len = new_len;
@@ -458,7 +473,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (offset_in_page(ret))
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ uf_unmap);
if (!(offset_in_page(ret)))
goto out;
out1:
@@ -497,6 +513,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
@@ -523,7 +541,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ &locked, &uf, &uf_unmap);
goto out;
}
@@ -533,7 +551,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* do_munmap does all the needed commit accounting
*/
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
if (ret && old_len != new_len)
goto out;
ret = addr;
@@ -592,7 +610,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr,
+ &locked, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
@@ -602,5 +621,7 @@ out:
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+ mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b8faaf2a9e9..2d131b97a851 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -17,6 +17,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
@@ -35,7 +36,7 @@
#include <linux/audit.h>
#include <linux/printk.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -176,9 +177,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -187,7 +189,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -517,7 +518,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
}
/*
- * initialise the VMA and region record slabs
+ * initialise the percpu counter for VM and region record slabs
*/
void __init mmap_init(void)
{
@@ -757,7 +758,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
mm->map_count--;
for (i = 0; i < VMACACHE_SIZE; i++) {
/* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache[i] == vma) {
+ if (curr->vmacache.vmas[i] == vma) {
vmacache_invalidate(mm);
break;
}
@@ -1084,7 +1085,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
{
int ret;
- ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+ ret = call_mmap(vma->vm_file, vma);
if (ret == 0) {
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
@@ -1115,7 +1116,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* - VM_MAYSHARE will be set if it may attempt to share
*/
if (capabilities & NOMMU_MAP_DIRECT) {
- ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+ ret = call_mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
@@ -1191,7 +1192,7 @@ error_free:
enomem:
pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
}
@@ -1205,7 +1206,8 @@ unsigned long do_mmap(struct file *file,
unsigned long flags,
vm_flags_t vm_flags,
unsigned long pgoff,
- unsigned long *populate)
+ unsigned long *populate,
+ struct list_head *uf)
{
struct vm_area_struct *vma;
struct vm_region *region;
@@ -1412,13 +1414,13 @@ error_getting_vma:
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
error_getting_region:
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
- show_free_areas(0);
+ show_free_areas(0, NULL);
return -ENOMEM;
}
@@ -1577,7 +1579,7 @@ static int shrink_vma(struct mm_struct *mm,
* - under NOMMU conditions the chunk to be unmapped must be backed by a single
* VMA, though it need not cover the whole VMA
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
struct vm_area_struct *vma;
unsigned long end;
@@ -1643,7 +1645,7 @@ int vm_munmap(unsigned long addr, size_t len)
int ret;
down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
+ ret = do_munmap(mm, addr, len, NULL);
up_write(&mm->mmap_sem);
return ret;
}
@@ -1794,21 +1796,21 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+int filemap_fault(struct vm_fault *vmf)
{
BUG();
return 0;
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
}
EXPORT_SYMBOL(filemap_map_pages);
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -1878,6 +1880,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
mmput(mm);
return len;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ec9f11d4f094..d083714a2bb9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,6 +22,9 @@
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -403,12 +406,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
- nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
-
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, &oc->gfp_mask,
- nodemask_pr_args(nm), oc->order,
- current->signal->oom_score_adj);
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=",
+ current->comm, oc->gfp_mask, &oc->gfp_mask);
+ if (oc->nodemask)
+ pr_cont("%*pbl", nodemask_pr_args(oc->nodemask));
+ else
+ pr_cont("(null)");
+ pr_cont(", order=%d, oom_score_adj=%hd\n",
+ oc->order, current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
@@ -417,7 +422,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
if (oc->memcg)
mem_cgroup_print_oom_info(oc->memcg, p);
else
- show_mem(SHOW_MEM_FILTER_NODES);
+ show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
if (sysctl_oom_dump_tasks)
dump_tasks(oc->memcg, oc->nodemask);
}
@@ -465,8 +470,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
- struct zap_details details = {.check_swap_entries = true,
- .ignore_dirty = true};
bool ret = true;
/*
@@ -510,14 +513,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
tlb_gather_mmu(&tlb, mm, 0, -1);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (is_vm_hugetlb_page(vma))
- continue;
-
- /*
- * mlocked VMAs require explicit munlocking before unmap.
- * Let's keep it simple here and skip such VMAs.
- */
- if (vma->vm_flags & VM_LOCKED)
+ if (!can_madv_dontneed_vma(vma))
continue;
/*
@@ -532,7 +528,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- &details);
+ NULL);
}
tlb_finish_mmu(&tlb, 0, -1);
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
@@ -660,7 +656,7 @@ static void mark_oom_victim(struct task_struct *tsk)
/* oom_mm is bound to the signal struct life time. */
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
- atomic_inc(&tsk->signal->oom_mm->mm_count);
+ mmgrab(tsk->signal->oom_mm);
/*
* Make sure that the task is woken up from uninterruptible sleep
@@ -877,7 +873,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
/*
* We should send SIGKILL before setting TIF_MEMDIE in order to prevent
* the OOM victim from depleting the memory reserves from the user
@@ -1013,7 +1009,7 @@ bool out_of_memory(struct oom_control *oc)
* make sure exclude 0 mask - all other users should have at least
* ___GFP_DIRECT_RECLAIM to get here.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903..d8ac2a7fb9e7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
+#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
@@ -580,7 +581,7 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
__fprop_inc_percpu_max(&dom->completions, completions,
max_prop_frac);
/* First event after period switching was turned off? */
- if (!unlikely(dom->period_time)) {
+ if (unlikely(!dom->period_time)) {
/*
* We can race with other __bdi_writeout_inc calls here but
* it does not cause any harm since the resulting time when
@@ -1778,6 +1779,7 @@ pause:
pause,
start_time);
__set_current_state(TASK_KILLABLE);
+ wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
@@ -1796,7 +1798,7 @@ pause:
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
- * In theory 1 page is enough to keep the comsumer-producer
+ * In theory 1 page is enough to keep the consumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
@@ -1987,11 +1989,11 @@ void laptop_mode_timer_fn(unsigned long data)
* We want to write everything out, not just down to the dirty
* threshold
*/
- if (!bdi_has_dirty_io(&q->backing_dev_info))
+ if (!bdi_has_dirty_io(q->backing_dev_info))
return;
rcu_read_lock();
- list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
+ list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
@@ -2105,18 +2107,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged;
-
- do {
- spin_lock_irq(&mapping->tree_lock);
- tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
- &start, end, WRITEBACK_TAG_BATCH,
- PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ unsigned long tagged = 0;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ PAGECACHE_TAG_DIRTY) {
+ if (iter.index > end)
+ break;
+ radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ PAGECACHE_TAG_TOWRITE);
+ tagged++;
+ if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
- WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
- /* We check 'start' to handle wrapping when end == ~0UL */
- } while (tagged >= WRITEBACK_TAG_BATCH && start);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440e3ae2..07efbc3a8656 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,12 +55,13 @@
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
+#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
-#include <linux/page_ext.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
+#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -91,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
@@ -714,7 +719,7 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
@@ -729,9 +734,6 @@ static inline void rmv_page_order(struct page *page)
static inline int page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
- if (!pfn_valid_within(page_to_pfn(buddy)))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
@@ -787,9 +789,8 @@ static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
- unsigned long page_idx;
- unsigned long combined_idx;
- unsigned long uninitialized_var(buddy_idx);
+ unsigned long combined_pfn;
+ unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
@@ -802,15 +803,16 @@ static inline void __free_one_page(struct page *page,
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
-
- VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order - 1) {
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+
+ if (!pfn_valid_within(buddy_pfn))
+ goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
@@ -824,9 +826,9 @@ continue_merging:
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
- combined_idx = buddy_idx & page_idx;
- page = page + (combined_idx - page_idx);
- page_idx = combined_idx;
+ combined_pfn = buddy_pfn & pfn;
+ page = page + (combined_pfn - pfn);
+ pfn = combined_pfn;
order++;
}
if (max_order < MAX_ORDER) {
@@ -841,8 +843,8 @@ continue_merging:
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
@@ -865,13 +867,14 @@ done_merging:
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
struct page *higher_page, *higher_buddy;
- combined_idx = buddy_idx & page_idx;
- higher_page = page + (combined_idx - page_idx);
- buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = higher_page + (buddy_idx - combined_idx);
- if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+ combined_pfn = buddy_pfn & pfn;
+ higher_page = page + (combined_pfn - pfn);
+ buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+ higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+ if (pfn_valid_within(buddy_pfn) &&
+ page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
@@ -1864,14 +1867,14 @@ int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
if (!PageBuddy(page)) {
page++;
continue;
@@ -2058,8 +2061,12 @@ out_unlock:
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
*/
-static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
@@ -2067,11 +2074,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
struct zone *zone;
struct page *page;
int order;
+ bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
ac->nodemask) {
- /* Preserve at least one pageblock */
- if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -2085,13 +2097,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
continue;
/*
- * It should never happen but changes to locking could
- * inadvertently allow a per-cpu drain to add pages
- * to MIGRATE_HIGHATOMIC while unreserving so be safe
- * and watch for underflows.
+ * In page freeing path, migratetype change is racy so
+ * we can counter several free pages in a pageblock
+ * in this loop althoug we changed the pageblock type
+ * from highatomic to ac->migratetype. So we should
+ * adjust the count once.
*/
- zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
- zone->nr_reserved_highatomic);
+ if (get_pageblock_migratetype(page) ==
+ MIGRATE_HIGHATOMIC) {
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ zone->nr_reserved_highatomic -= min(
+ pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+ }
/*
* Convert to ac->migratetype and avoid the normal
@@ -2103,12 +2127,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- move_freepages_block(zone, page, ac->migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
- return;
+ ret = move_freepages_block(zone, page, ac->migratetype);
+ if (ret) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
+
+ return false;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -2133,7 +2161,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal)
+ if (can_steal &&
+ get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@@ -2192,7 +2221,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, bool cold)
{
- int i;
+ int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -2217,13 +2246,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
+ alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
+
+ /*
+ * i pages were removed from the buddy list even if some leak due
+ * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+ * on i. Do not confuse with 'alloced' which is the number of
+ * pages added to the pcp list.
+ */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
- return i;
+ return alloced;
}
#ifdef CONFIG_NUMA
@@ -2307,16 +2344,26 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
+static void drain_local_pages_wq(struct work_struct *work)
+{
+ /*
+ * drain_all_pages doesn't use proper cpu hotplug protection so
+ * we can race with cpu offline when the WQ can move this from
+ * a cpu pinned worker to an unbound one. We can operate on a different
+ * cpu which is allright but we also have to make sure to not move to
+ * a different one.
+ */
+ preempt_disable();
+ drain_local_pages(NULL);
+ preempt_enable();
+}
+
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
*
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
@@ -2329,6 +2376,28 @@ void drain_all_pages(struct zone *zone)
static cpumask_t cpus_with_pcps;
/*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON_ONCE(!mm_percpu_wq))
+ return;
+
+ /* Workqueues cannot recurse */
+ if (current->flags & PF_WQ_WORKER)
+ return;
+
+ /*
+ * Do not drain if one is already in progress unless it's specific to
+ * a zone. Such callers are primarily CMA and memory hotplug and need
+ * the drain to be complete when the call returns.
+ */
+ if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+ if (!zone)
+ return;
+ mutex_lock(&pcpu_drain_mutex);
+ }
+
+ /*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
@@ -2358,8 +2427,16 @@ void drain_all_pages(struct zone *zone)
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
- on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
- zone, 1);
+
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+ INIT_WORK(work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, work);
+ }
+ for_each_cpu(cpu, &cpus_with_pcps)
+ flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+ mutex_unlock(&pcpu_drain_mutex);
}
#ifdef CONFIG_HIBERNATION
@@ -2534,7 +2611,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+ && mt != MIGRATE_HIGHATOMIC)
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -2548,101 +2626,123 @@ int __isolate_free_page(struct page *page, unsigned int order)
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
- gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- int local_nid = numa_node_id();
enum zone_stat_item local_stat = NUMA_LOCAL;
- if (unlikely(flags & __GFP_OTHER_NODE)) {
+ if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
- local_nid = preferred_zone->node;
- }
- if (z->node == local_nid) {
+ if (z->node == preferred_zone->node)
__inc_zone_state(z, NUMA_HIT);
- __inc_zone_state(z, local_stat);
- } else {
+ else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
+ __inc_zone_state(z, local_stat);
#endif
}
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ bool cold, struct per_cpu_pages *pcp,
+ struct list_head *list)
+{
+ struct page *page;
+
+ do {
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
+ return NULL;
+ }
+
+ if (cold)
+ page = list_last_entry(list, struct page, lru);
+ else
+ page = list_first_entry(list, struct page, lru);
+
+ list_del(&page->lru);
+ pcp->count--;
+ } while (check_new_pcp(page));
+
+ return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
+{
+ struct per_cpu_pages *pcp;
+ struct list_head *list;
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
+ struct page *page;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ if (page) {
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone);
+ }
+ local_irq_restore(flags);
+ return page;
+}
+
/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
- struct list_head *list;
-
- local_irq_save(flags);
- do {
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
-
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
-
- list_del(&page->lru);
- pcp->count--;
+ page = rmqueue_pcplist(preferred_zone, zone, order,
+ gfp_flags, migratetype);
+ goto out;
+ }
- } while (check_new_pcp(page));
- } else {
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+ spin_lock_irqsave(&zone->lock, flags);
- do {
- page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
- if (!page)
- page = __rmqueue(zone, order, migratetype);
- } while (page && check_new_pages(page, order));
- spin_unlock(&zone->lock);
+ do {
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
if (!page)
- goto failed;
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- }
+ page = __rmqueue(zone, order, migratetype);
+ } while (page && check_new_pages(page, order));
+ spin_unlock(&zone->lock);
+ if (!page)
+ goto failed;
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, gfp_flags);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
- VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+ VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
@@ -2850,7 +2950,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
#ifdef CONFIG_NUMA
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
- return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
@@ -2947,7 +3047,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
try_this_zone:
- page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+ page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -2980,18 +3080,12 @@ static inline bool should_suppress_show_mem(void)
return ret;
}
-static DEFINE_RATELIMIT_STATE(nopage_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- struct va_format vaf;
- va_list args;
+ static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
- debug_guardpage_minorder() > 0)
+ if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
return;
/*
@@ -3006,6 +3100,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
+ show_mem(filter, nodemask);
+}
+
+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
+ return;
+
pr_warn("%s: ", current->comm);
va_start(args, fmt);
@@ -3014,11 +3122,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
pr_cont("%pV", &vaf);
va_end(args);
- pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
+ pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
+ if (nodemask)
+ pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
+ else
+ pr_cont("(null)\n");
+
+ cpuset_print_current_mems_allowed();
dump_stack();
- if (!should_suppress_show_mem())
- show_mem(filter);
+ warn_alloc_show_mem(gfp_mask, nodemask);
+}
+
+static inline struct page *
+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
+ unsigned int alloc_flags,
+ const struct alloc_context *ac)
+{
+ struct page *page;
+
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags, ac);
+
+ return page;
}
static inline struct page *
@@ -3056,47 +3189,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (page)
goto out;
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /* Coredumps can quickly deplete all memory reserves */
- if (current->flags & PF_DUMPCORE)
- goto out;
- /* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto out;
- /* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
- goto out;
- if (pm_suspended_storage())
- goto out;
- /*
- * XXX: GFP_NOFS allocations should rather fail than rely on
- * other request to make a forward progress.
- * We are in an unfortunate situation where out_of_memory cannot
- * do much for this context but let's try it to at least get
- * access to memory reserved if the current task is killed (see
- * out_of_memory). Once filesystems are ready to handle allocation
- * failures more gracefully we should just bail out here.
- */
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (ac->high_zoneidx < ZONE_NORMAL)
+ goto out;
+ if (pm_suspended_storage())
+ goto out;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other request to make a forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
+ /* The OOM killer may not free memory on a specific node */
+ if (gfp_mask & __GFP_THISNODE)
+ goto out;
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
- }
/* Exhausted what can be done so it's blamo time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
- if (gfp_mask & __GFP_NOFAIL) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
- /*
- * fallback to ignore cpuset restriction if our nodes
- * are depleted
- */
- if (!page)
- page = get_page_from_freelist(gfp_mask, order,
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves
+ */
+ if (gfp_mask & __GFP_NOFAIL)
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order,
ALLOC_NO_WATERMARKS, ac);
- }
}
out:
mutex_unlock(&oom_lock);
@@ -3165,6 +3293,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
{
int max_retries = MAX_COMPACT_RETRIES;
int min_priority;
+ bool ret = false;
+ int retries = *compaction_retries;
+ enum compact_priority priority = *compact_priority;
if (!order)
return false;
@@ -3186,8 +3317,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
* But do not retry if the given zonelist is not suitable for
* compaction.
*/
- if (compaction_withdrawn(compact_result))
- return compaction_zonelist_suitable(ac, order, alloc_flags);
+ if (compaction_withdrawn(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
/*
* !costly requests are much more important than __GFP_REPEAT
@@ -3199,8 +3332,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
- if (*compaction_retries <= max_retries)
- return true;
+ if (*compaction_retries <= max_retries) {
+ ret = true;
+ goto out;
+ }
/*
* Make sure there are attempts at the highest priority if we exhausted
@@ -3209,12 +3344,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
+
if (*compact_priority > min_priority) {
(*compact_priority)--;
*compaction_retries = 0;
- return true;
+ ret = true;
}
- return false;
+out:
+ trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
+ return ret;
}
#else
static inline struct page *
@@ -3305,7 +3443,7 @@ retry:
* Shrink them them and try again
*/
if (!page && !drained) {
- unreserve_highatomic_pageblock(ac);
+ unreserve_highatomic_pageblock(ac, false);
drain_all_pages(NULL);
drained = true;
goto retry;
@@ -3422,8 +3560,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
- if (*no_progress_loops > MAX_RECLAIM_RETRIES)
- return false;
+ if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
+ /* Before OOM, exhaust highatomic_reserve */
+ return unreserve_highatomic_pageblock(ac, true);
+ }
/*
* Keep reclaiming pages while there is a chance this will lead
@@ -3435,6 +3575,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
+ unsigned long min_wmark = min_wmark_pages(zone);
+ bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3445,8 +3587,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Would the allocation succeed if we reclaimed the whole
* available?
*/
- if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags, available)) {
+ wmark = __zone_watermark_ok(zone, order, min_wmark,
+ ac_classzone_idx(ac), alloc_flags, available);
+ trace_reclaim_retry_zone(z, order, reclaimable,
+ available, min_wmark, *no_progress_loops, wmark);
+ if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
@@ -3494,12 +3639,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+ enum compact_priority compact_priority;
enum compact_result compact_result;
- int compaction_retries = 0;
- int no_progress_loops = 0;
+ int compaction_retries;
+ int no_progress_loops;
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
+ unsigned int cpuset_mems_cookie;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3520,6 +3666,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
+retry_cpuset:
+ compaction_retries = 0;
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3527,6 +3679,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * We need to recalculate the starting point for the zonelist iterator
+ * because we might have used different nodemask in the fast path, or
+ * there was a cpuset modification and we are retrying - otherwise we
+ * could end up iterating over non-eligible zones endlessly.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+ if (!ac->preferred_zoneref->zone)
+ goto nopage;
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3603,35 +3766,21 @@ retry:
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
- if (!can_direct_reclaim) {
- /*
- * All existing users of the __GFP_NOFAIL are blockable, so warn
- * of any new users that actually allow this type of allocation
- * to fail.
- */
- WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+ if (!can_direct_reclaim)
goto nopage;
- }
- /* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC) {
- /*
- * __GFP_NOFAIL request from this context is rather bizarre
- * because we cannot reclaim anything and only can loop waiting
- * for somebody to do a work for us.
- */
- if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
- cond_resched();
- goto retry;
- }
- goto nopage;
+ /* Make sure we know about allocations which stall for too long */
+ if (time_after(jiffies, alloc_start + stall_timeout)) {
+ warn_alloc(gfp_mask, ac->nodemask,
+ "page allocation stalls for %ums, order:%u",
+ jiffies_to_msecs(jiffies-alloc_start), order);
+ stall_timeout += 10 * HZ;
}
- /* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+ /* Avoid recursion of direct reclaim */
+ if (current->flags & PF_MEMALLOC)
goto nopage;
-
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
@@ -3655,14 +3804,6 @@ retry:
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
@@ -3679,11 +3820,22 @@ retry:
&compaction_retries))
goto retry;
+ /*
+ * It's possible we raced with cpuset update so the OOM would be
+ * premature (see below the nopage: label for full explanation).
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (test_thread_flag(TIF_MEMDIE))
+ goto nopage;
+
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
@@ -3691,74 +3843,127 @@ retry:
}
nopage:
- warn_alloc(gfp_mask,
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
+ /*
+ * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
+ * we always retry
+ */
+ if (gfp_mask & __GFP_NOFAIL) {
+ /*
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually require GFP_NOWAIT
+ */
+ if (WARN_ON_ONCE(!can_direct_reclaim))
+ goto fail;
+
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do a work for us
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+
+ /*
+ * non failing costly orders are a hard requirement which we
+ * are not prepared for much so let's warn about these users
+ * so that we can identify them and convert them to something
+ * else.
+ */
+ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+
+ /*
+ * Help non-failing allocations by giving them access to memory
+ * reserves but do not use ALLOC_NO_WATERMARKS because this
+ * could deplete whole memory reserves which would just make
+ * the situation worse
+ */
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
+ if (page)
+ goto got_pg;
+
+ cond_resched();
+ goto retry;
+ }
+fail:
+ warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask,
+ struct alloc_context *ac, gfp_t *alloc_mask,
+ unsigned int *alloc_flags)
{
- struct page *page;
- unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW;
- gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
- struct alloc_context ac = {
- .high_zoneidx = gfp_zone(gfp_mask),
- .zonelist = zonelist,
- .nodemask = nodemask,
- .migratetype = gfpflags_to_migratetype(gfp_mask),
- };
+ ac->high_zoneidx = gfp_zone(gfp_mask);
+ ac->zonelist = zonelist;
+ ac->nodemask = nodemask;
+ ac->migratetype = gfpflags_to_migratetype(gfp_mask);
if (cpusets_enabled()) {
- alloc_mask |= __GFP_HARDWALL;
- alloc_flags |= ALLOC_CPUSET;
- if (!ac.nodemask)
- ac.nodemask = &cpuset_current_mems_allowed;
+ *alloc_mask |= __GFP_HARDWALL;
+ if (!ac->nodemask)
+ ac->nodemask = &cpuset_current_mems_allowed;
+ else
+ *alloc_flags |= ALLOC_CPUSET;
}
- gfp_mask &= gfp_allowed_mask;
-
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
-
- /*
- * Check the zones suitable for the gfp_mask contain at least one
- * valid zone. It's possible to have an empty zonelist as a result
- * of __GFP_THISNODE and a memoryless node
- */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
+ return false;
- if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
+ if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+ *alloc_flags |= ALLOC_CMA;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
+ return true;
+}
+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+ unsigned int order, struct alloc_context *ac)
+{
/* Dirty zone balancing only done in the fast path */
- ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
- ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
- ac.high_zoneidx, ac.nodemask);
- if (!ac.preferred_zoneref) {
- page = NULL;
- goto no_zone;
- }
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
+{
+ struct page *page;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
+ gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = { };
+
+ gfp_mask &= gfp_allowed_mask;
+ if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ return NULL;
+
+ finalise_ac(gfp_mask, order, &ac);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -3776,21 +3981,10 @@ retry_cpuset:
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
- if (cpusets_enabled())
+ if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-no_zone:
- /*
- * When updating a task's mems_allowed, it is possible to race with
- * parallel threads in such a way that an allocation can fail while
- * the mask is being updated. If a page allocation is about to fail,
- * check if the cpuset changed during allocation and if so, retry.
- */
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
- alloc_mask = gfp_mask;
- goto retry_cpuset;
- }
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3867,8 +4061,8 @@ EXPORT_SYMBOL(free_pages);
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3888,8 +4082,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
-void *__alloc_page_frag(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count)) {
+ unsigned int order = compound_order(page);
+
+ if (order == 0)
+ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3897,7 +4106,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
if (unlikely(!nc->va)) {
refill:
- page = __page_frag_refill(nc, gfp_mask);
+ page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
@@ -3940,19 +4149,19 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
@@ -4182,20 +4391,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
*/
-bool skip_free_areas_node(unsigned int flags, int nid)
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
- bool ret = false;
- unsigned int cpuset_mems_cookie;
-
if (!(flags & SHOW_MEM_FILTER_NODES))
- goto out;
+ return false;
- do {
- cpuset_mems_cookie = read_mems_allowed_begin();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
-out:
- return ret;
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4236,7 +4445,7 @@ static void show_migration_types(unsigned char type)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
int cpu;
@@ -4244,7 +4453,7 @@ void show_free_areas(unsigned int filter)
pg_data_t *pgdat;
for_each_populated_zone(zone) {
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
for_each_online_cpu(cpu)
@@ -4278,6 +4487,9 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
@@ -4311,13 +4523,13 @@ void show_free_areas(unsigned int filter)
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
- K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
node_page_state(pgdat, NR_PAGES_SCANNED),
@@ -4327,7 +4539,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
int i;
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
free_pcp = 0;
@@ -4392,7 +4604,7 @@ void show_free_areas(unsigned int filter)
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
- if (skip_free_areas_node(filter, zone_to_nid(zone)))
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
printk(KERN_CONT "%s: ", zone->name);
@@ -5013,8 +5225,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (context != MEMMAP_EARLY)
goto not_early;
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * Skip to the pfn preceding the next valid one (or
+ * end_pfn), such that we hit a valid pfn (or end_pfn)
+ * on our next iteration of the loop.
+ */
+ pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
+#endif
continue;
+ }
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
@@ -5710,7 +5931,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* the zone and SPARSEMEM is in use. If there are holes within the
* zone, each populated memory region may cost us one or two extra
* memmap pages due to alignment because memmap pages for each
- * populated regions may not naturally algined on page boundary.
+ * populated regions may not be naturally aligned on page boundary.
* So the (present_pages >> 4) heuristic is a tradeoff for that.
*/
if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6274,8 +6495,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
start_pfn = end_pfn;
}
- arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
- arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -6399,8 +6618,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK (%p - %p)\n",
- s, pages << (PAGE_SHIFT - 10), start, end);
+ pr_info("Freeing %s memory: %ldK\n",
+ s, pages << (PAGE_SHIFT - 10));
return pages;
}
@@ -6491,38 +6710,39 @@ void __init free_area_init(unsigned long *zones_size)
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-static int page_alloc_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int page_alloc_cpu_dead(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- lru_add_drain_cpu(cpu);
- drain_pages(cpu);
+ lru_add_drain_cpu(cpu);
+ drain_pages(cpu);
- /*
- * Spill the event counters of the dead processor
- * into the current processors event counters.
- * This artificially elevates the count of the current
- * processor.
- */
- vm_events_fold_cpu(cpu);
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
+ vm_events_fold_cpu(cpu);
- /*
- * Zero the differential counters of the dead processor
- * so that the vm statistics are consistent.
- *
- * This is only okay since the processor is dead and cannot
- * race with what we are doing.
- */
- cpu_vm_stats_fold(cpu);
- }
- return NOTIFY_OK;
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
+ cpu_vm_stats_fold(cpu);
+ return 0;
}
void __init page_alloc_init(void)
{
- hotcpu_notifier(page_alloc_cpu_notify, 0);
+ int ret;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
+ "mm/page_alloc:dead", NULL,
+ page_alloc_cpu_dead);
+ WARN_ON(ret < 0);
}
/*
@@ -7010,8 +7230,9 @@ void *__init alloc_large_system_hash(const char *tablename,
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages)
@@ -7067,6 +7288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (skip_hwpoisoned_pages && PageHWPoison(page))
continue;
+ if (__PageMovable(page))
+ continue;
+
if (!PageLRU(page))
found++;
/*
@@ -7178,6 +7402,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
+ * @gfp_mask: GFP mask to use during compaction
*
* The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
* aligned, however it's the caller's responsibility to guarantee that
@@ -7191,7 +7416,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* need to be freed with free_contig_range().
*/
int alloc_contig_range(unsigned long start, unsigned long end,
- unsigned migratetype)
+ unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
unsigned int order;
@@ -7203,6 +7428,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .gfp_mask = memalloc_noio_flags(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index ae11aa914e55..b0ee56c56b58 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -54,27 +54,27 @@ static int page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct mm_struct *mm = vma->vm_mm;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = addr,
+ };
bool referenced = false;
- if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
- return SWAP_AGAIN;
-
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap(pte);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
- } else {
- /* unexpected pmd-mapped page? */
- WARN_ON_ONCE(1);
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ referenced = ptep_clear_young_notify(vma, addr,
+ pvmw.pte);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ referenced = pmdp_clear_young_notify(vma, addr,
+ pvmw.pmd);
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
+ }
}
- spin_unlock(ptl);
-
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/page_io.c b/mm/page_io.c
index a2651f58c86a..23f6d0d3470f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- if (wbc->sync_mode == WB_SYNC_ALL)
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
- else
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5594bfcc5ed..f4e17a57926a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
unsigned long flags, nr_pages;
bool isolated_page = false;
unsigned int order;
- unsigned long page_idx, buddy_idx;
+ unsigned long pfn, buddy_pfn;
struct page *buddy;
zone = page_zone(page);
@@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (PageBuddy(page)) {
order = page_order(page);
if (order >= pageblock_order) {
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- buddy_idx = __find_buddy_index(page_idx, order);
- buddy = page + (buddy_idx - page_idx);
+ pfn = page_to_pfn(page);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
- if (pfn_valid_within(page_to_pfn(buddy)) &&
+ if (pfn_valid_within(buddy_pfn) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
isolated_page = true;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
new file mode 100644
index 000000000000..de9c40d7304a
--- /dev/null
+++ b/mm/page_vma_mapped.c
@@ -0,0 +1,223 @@
+#include <linux/mm.h>
+#include <linux/rmap.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+static inline bool check_pmd(struct page_vma_mapped_walk *pvmw)
+{
+ pmd_t pmde;
+ /*
+ * Make sure we don't re-load pmd between present and !trans_huge check.
+ * We need a consistent view.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+ return pmd_present(pmde) && !pmd_trans_huge(pmde);
+}
+
+static inline bool not_found(struct page_vma_mapped_walk *pvmw)
+{
+ page_vma_mapped_walk_done(pvmw);
+ return false;
+}
+
+static bool map_pte(struct page_vma_mapped_walk *pvmw)
+{
+ pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
+ if (!(pvmw->flags & PVMW_SYNC)) {
+ if (pvmw->flags & PVMW_MIGRATION) {
+ if (!is_swap_pte(*pvmw->pte))
+ return false;
+ } else {
+ if (!pte_present(*pvmw->pte))
+ return false;
+ }
+ }
+ pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
+ return true;
+}
+
+static bool check_pte(struct page_vma_mapped_walk *pvmw)
+{
+ if (pvmw->flags & PVMW_MIGRATION) {
+#ifdef CONFIG_MIGRATION
+ swp_entry_t entry;
+ if (!is_swap_pte(*pvmw->pte))
+ return false;
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_migration_entry(entry))
+ return false;
+ if (migration_entry_to_page(entry) - pvmw->page >=
+ hpage_nr_pages(pvmw->page)) {
+ return false;
+ }
+ if (migration_entry_to_page(entry) < pvmw->page)
+ return false;
+#else
+ WARN_ON_ONCE(1);
+#endif
+ } else {
+ if (!pte_present(*pvmw->pte))
+ return false;
+
+ /* THP can be referenced by any subpage */
+ if (pte_page(*pvmw->pte) - pvmw->page >=
+ hpage_nr_pages(pvmw->page)) {
+ return false;
+ }
+ if (pte_page(*pvmw->pte) < pvmw->page)
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
+ * @pvmw->address
+ * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
+ * must be set. pmd, pte and ptl must be NULL.
+ *
+ * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point
+ * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is
+ * adjusted if needed (for PTE-mapped THPs).
+ *
+ * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page
+ * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in
+ * a loop to find all PTEs that map the THP.
+ *
+ * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry
+ * regardless of which page table level the page is mapped at. @pvmw->pmd is
+ * NULL.
+ *
+ * Retruns false if there are no more page table entries for the page in
+ * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
+ *
+ * If you need to stop the walk before page_vma_mapped_walk() returned false,
+ * use page_vma_mapped_walk_done(). It will do the housekeeping.
+ */
+bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+{
+ struct mm_struct *mm = pvmw->vma->vm_mm;
+ struct page *page = pvmw->page;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ /* The only possible pmd mapping has been handled on last iteration */
+ if (pvmw->pmd && !pvmw->pte)
+ return not_found(pvmw);
+
+ if (pvmw->pte)
+ goto next_pte;
+
+ if (unlikely(PageHuge(pvmw->page))) {
+ /* when pud is not present, pte will be NULL */
+ pvmw->pte = huge_pte_offset(mm, pvmw->address);
+ if (!pvmw->pte)
+ return false;
+
+ pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ spin_lock(pvmw->ptl);
+ if (!check_pte(pvmw))
+ return not_found(pvmw);
+ return true;
+ }
+restart:
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d))
+ return false;
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud))
+ return false;
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ if (pmd_trans_huge(*pvmw->pmd)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ if (!pmd_present(*pvmw->pmd))
+ return not_found(pvmw);
+ if (likely(pmd_trans_huge(*pvmw->pmd))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+ if (pmd_page(*pvmw->pmd) != page)
+ return not_found(pvmw);
+ return true;
+ } else {
+ /* THP pmd was split under us: handle on pte level */
+ spin_unlock(pvmw->ptl);
+ pvmw->ptl = NULL;
+ }
+ } else {
+ if (!check_pmd(pvmw))
+ return false;
+ }
+ if (!map_pte(pvmw))
+ goto next_pte;
+ while (1) {
+ if (check_pte(pvmw))
+ return true;
+next_pte:
+ /* Seek to next pte only makes sense for THP */
+ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
+ return not_found(pvmw);
+ do {
+ pvmw->address += PAGE_SIZE;
+ if (pvmw->address >= pvmw->vma->vm_end ||
+ pvmw->address >=
+ __vma_address(pvmw->page, pvmw->vma) +
+ hpage_nr_pages(pvmw->page) * PAGE_SIZE)
+ return not_found(pvmw);
+ /* Did we cross page table boundary? */
+ if (pvmw->address % PMD_SIZE == 0) {
+ pte_unmap(pvmw->pte);
+ if (pvmw->ptl) {
+ spin_unlock(pvmw->ptl);
+ pvmw->ptl = NULL;
+ }
+ goto restart;
+ } else {
+ pvmw->pte++;
+ }
+ } while (pte_none(*pvmw->pte));
+
+ if (!pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
+ }
+ }
+}
+
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA. Only
+ * valid for normal file or anonymous VMAs.
+ */
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .flags = PVMW_SYNC,
+ };
+ unsigned long start, end;
+
+ start = __vma_address(page, vma);
+ end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+
+ if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ return 0;
+ pvmw.address = max(start, vma->vm_start);
+ if (!page_vma_mapped_walk(&pvmw))
+ return 0;
+ page_vma_mapped_walk_done(&pvmw);
+ return 1;
+}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 207244489a68..60f7856e508f 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -69,23 +69,41 @@ again:
return err;
}
-static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pud_t *pud;
unsigned long next;
int err = 0;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
+ again:
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud)) {
+ if (pud_none(*pud) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
+
+ if (walk->pud_entry) {
+ spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
+
+ if (ptl) {
+ err = walk->pud_entry(pud, addr, next, walk);
+ spin_unlock(ptl);
+ if (err)
+ break;
+ continue;
+ }
+ }
+
+ split_huge_pud(walk->vma, pud, addr);
+ if (pud_none(*pud))
+ goto again;
+
if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
@@ -95,6 +113,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return err;
}
+static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int err = 0;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pmd_entry || walk->pte_entry)
+ err = walk_pud_range(p4d, addr, next, walk);
+ if (err)
+ break;
+ } while (p4d++, addr = next, addr != end);
+
+ return err;
+}
+
static int walk_pgd_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@@ -113,7 +157,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
continue;
}
if (walk->pmd_entry || walk->pte_entry)
- err = walk_pud_range(pgd, addr, next, walk);
+ err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
} while (pgd++, addr = next, addr != end);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 538998a137d2..9ac639499bd1 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -21,7 +21,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
/**
* pcpu_get_pages - get temp pages array
- * @chunk: chunk of interest
*
* Returns pointer to array of pointers to struct page which can be indexed
* with pcpu_page_idx(). Note that there is only one array and accesses
@@ -30,7 +29,7 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
* RETURNS:
* Pointer to temp pages array on success.
*/
-static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
+static struct page **pcpu_get_pages(void)
{
static struct page **pages;
size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
@@ -275,7 +274,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
{
struct page **pages;
- pages = pcpu_get_pages(chunk);
+ pages = pcpu_get_pages();
if (!pages)
return -ENOMEM;
@@ -313,7 +312,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
* successful population attempt so the temp pages array must
* be available now.
*/
- pages = pcpu_get_pages(chunk);
+ pages = pcpu_get_pages();
BUG_ON(!pages);
/* unmap and free */
diff --git a/mm/percpu.c b/mm/percpu.c
index 255714302394..60a6488e9e6d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -43,7 +43,7 @@
* Chunks can be determined from the address using the index field
* in the page struct. The index field contains a pointer to the chunk.
*
- * To use this allocator, arch code should do the followings.
+ * To use this allocator, arch code should do the following:
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
@@ -886,7 +886,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
- if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
+ !is_power_of_2(align))) {
WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align);
return NULL;
@@ -1010,8 +1011,11 @@ area_found:
mutex_unlock(&pcpu_alloc_mutex);
}
- if (chunk != pcpu_reserved_chunk)
+ if (chunk != pcpu_reserved_chunk) {
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_nr_empty_pop_pages -= occ_pages;
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
@@ -2093,6 +2097,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
size_t pages_size;
struct page **pages;
int unit, i, j, rc;
+ int upa;
+ int nr_g0_units;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
@@ -2100,7 +2106,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
- BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+ upa = ai->alloc_size/ai->unit_size;
+ nr_g0_units = roundup(num_possible_cpus(), upa);
+ if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+ pcpu_free_alloc_info(ai);
+ return -EINVAL;
+ }
unit_pages = ai->unit_size >> PAGE_SHIFT;
@@ -2111,21 +2122,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* allocate pages */
j = 0;
- for (unit = 0; unit < num_possible_cpus(); unit++)
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned int cpu = ai->groups[0].cpu_map[unit];
for (i = 0; i < unit_pages; i++) {
- unsigned int cpu = ai->groups[0].cpu_map[unit];
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
- psize_str, cpu);
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(ptr);
pages[j++] = virt_to_page(ptr);
}
+ }
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 71c5f9109f2a..c99d9512a45b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -22,6 +22,12 @@ void pgd_clear_bad(pgd_t *pgd)
pgd_clear(pgd);
}
+void p4d_clear_bad(p4d_t *p4d)
+{
+ p4d_ERROR(*p4d);
+ p4d_clear(p4d);
+}
+
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
@@ -123,6 +129,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ pud_t pud;
+
+ VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+ VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+ pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return pud;
+}
+#endif
#endif
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..8973cd231ece 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
@@ -88,7 +89,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- unsigned int flags = FOLL_REMOTE;
+ unsigned int flags = 0;
/* Work out address and page range required */
if (len == 0)
@@ -100,15 +101,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
size_t bytes;
/*
* Get the pages we're interested in. We must
- * add FOLL_REMOTE because task/mm might not
+ * access remotely because task/mm might not
* current/current->mm
*/
- pages = __get_user_pages_unlocked(task, mm, pa, pages,
- process_pages, flags);
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..c4ca70239233 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -207,12 +207,21 @@ out:
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct file_ra_state *ra = &filp->f_ra;
+ unsigned long max_pages;
+
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
- nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
+ /*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+ nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
int err;
@@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
- unsigned long max = ra->ra_pages;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ unsigned long max_pages = ra->ra_pages;
pgoff_t prev_offset;
/*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ if (req_size > max_pages && bdi->io_pages > max_pages)
+ max_pages = min(req_size, bdi->io_pages);
+
+ /*
* start of file
*/
if (!offset)
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max);
+ start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max)
+ if (!start || start - offset > max_pages)
return 0;
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
/*
* oversize read
*/
- if (req_size > max)
+ if (req_size > max_pages)
goto initial_readahead;
/*
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max))
+ if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;
/*
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
initial_readahead:
ra->start = offset;
- ra->size = get_init_ra_size(req_size, max);
+ ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
@@ -454,7 +471,7 @@ readit:
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max);
+ ra->async_size = get_next_ra_size(ra, max_pages);
ra->size += ra->async_size;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ef36404e7b2..f6838015810f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -46,6 +46,8 @@
*/
#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -141,14 +143,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
}
/**
- * anon_vma_prepare - attach an anon_vma to a memory region
+ * __anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
*
* This makes sure the memory mapping described by 'vma' has
* an 'anon_vma' attached to it, so that we can associate the
* anonymous pages mapped into it with that anon_vma.
*
- * The common case will be that we already have one, but if
+ * The common case will be that we already have one, which
+ * is handled inline by anon_vma_prepare(). But if
* not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
@@ -167,48 +170,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
*
* This must be called with the mmap_sem held for reading.
*/
-int anon_vma_prepare(struct vm_area_struct *vma)
+int __anon_vma_prepare(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
might_sleep();
- if (unlikely(!anon_vma)) {
- struct mm_struct *mm = vma->vm_mm;
- struct anon_vma *allocated;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_enomem;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_enomem;
+
+ anon_vma = find_mergeable_anon_vma(vma);
+ allocated = NULL;
+ if (!anon_vma) {
+ anon_vma = anon_vma_alloc();
+ if (unlikely(!anon_vma))
+ goto out_enomem_free_avc;
+ allocated = anon_vma;
+ }
- anon_vma = find_mergeable_anon_vma(vma);
+ anon_vma_lock_write(anon_vma);
+ /* page_table_lock to protect against threads */
+ spin_lock(&mm->page_table_lock);
+ if (likely(!vma->anon_vma)) {
+ vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+ /* vma reference or self-parent link for new root */
+ anon_vma->degree++;
allocated = NULL;
- if (!anon_vma) {
- anon_vma = anon_vma_alloc();
- if (unlikely(!anon_vma))
- goto out_enomem_free_avc;
- allocated = anon_vma;
- }
+ avc = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(anon_vma);
- anon_vma_lock_write(anon_vma);
- /* page_table_lock to protect against threads */
- spin_lock(&mm->page_table_lock);
- if (likely(!vma->anon_vma)) {
- vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
- allocated = NULL;
- avc = NULL;
- }
- spin_unlock(&mm->page_table_lock);
- anon_vma_unlock_write(anon_vma);
+ if (unlikely(allocated))
+ put_anon_vma(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
- if (unlikely(allocated))
- put_anon_vma(allocated);
- if (unlikely(avc))
- anon_vma_chain_free(avc);
- }
return 0;
out_enomem_free_avc:
@@ -608,8 +609,7 @@ void try_to_unmap_flush_dirty(void)
try_to_unmap_flush();
}
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
- struct page *page, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
@@ -644,8 +644,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
#else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
- struct page *page, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}
@@ -685,6 +684,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
pmd_t pmde;
@@ -693,7 +693,11 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
if (!pgd_present(*pgd))
goto out;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ goto out;
+
+ pud = pud_offset(p4d, address);
if (!pud_present(*pud))
goto out;
@@ -711,170 +715,6 @@ out:
return pmd;
}
-/*
- * Check that @page is mapped at @address into @mm.
- *
- * If @sync is false, page_check_address may perform a racy check to avoid
- * the page table lock when the pte is not present (helpful when reclaiming
- * highly shared pages).
- *
- * On success returns with pte mapped and locked.
- */
-pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address, spinlock_t **ptlp, int sync)
-{
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
-
- if (unlikely(PageHuge(page))) {
- /* when pud is not present, pte will be NULL */
- pte = huge_pte_offset(mm, address);
- if (!pte)
- return NULL;
-
- ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
- goto check;
- }
-
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
- return NULL;
-
- pte = pte_offset_map(pmd, address);
- /* Make a quick check before getting the lock */
- if (!sync && !pte_present(*pte)) {
- pte_unmap(pte);
- return NULL;
- }
-
- ptl = pte_lockptr(mm, pmd);
-check:
- spin_lock(ptl);
- if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
- *ptlp = ptl;
- return pte;
- }
- pte_unmap_unlock(pte, ptl);
- return NULL;
-}
-
-/**
- * page_mapped_in_vma - check whether a page is really mapped in a VMA
- * @page: the page to test
- * @vma: the VMA to test
- *
- * Returns 1 if the page is mapped into the page tables of the VMA, 0
- * if the page is not mapped into the page tables of this VMA. Only
- * valid for normal file or anonymous VMAs.
- */
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address;
- pte_t *pte;
- spinlock_t *ptl;
-
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- return 0;
- pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
- if (!pte) /* the page is not in this mm */
- return 0;
- pte_unmap_unlock(pte, ptl);
-
- return 1;
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/*
- * Check that @page is mapped at @address into @mm. In contrast to
- * page_check_address(), this function can handle transparent huge pages.
- *
- * On success returns true with pte mapped and locked. For PMD-mapped
- * transparent huge pages *@ptep is set to NULL.
- */
-bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
- unsigned long address, pmd_t **pmdp,
- pte_t **ptep, spinlock_t **ptlp)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
-
- if (unlikely(PageHuge(page))) {
- /* when pud is not present, pte will be NULL */
- pte = huge_pte_offset(mm, address);
- if (!pte)
- return false;
-
- ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
- pmd = NULL;
- goto check_pte;
- }
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return false;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return false;
- pmd = pmd_offset(pud, address);
-
- if (pmd_trans_huge(*pmd)) {
- ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock_pmd;
- if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
- goto map_pte;
- }
-
- if (pmd_page(*pmd) != page)
- goto unlock_pmd;
-
- pte = NULL;
- goto found;
-unlock_pmd:
- spin_unlock(ptl);
- return false;
- } else {
- pmd_t pmde = *pmd;
-
- barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
- return false;
- }
-map_pte:
- pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte)) {
- pte_unmap(pte);
- return false;
- }
-
- ptl = pte_lockptr(mm, pmd);
-check_pte:
- spin_lock(ptl);
-
- if (!pte_present(*pte)) {
- pte_unmap_unlock(pte, ptl);
- return false;
- }
-
- /* THP can be referenced by any subpage */
- if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
- pte_unmap_unlock(pte, ptl);
- return false;
- }
-found:
- *ptep = pte;
- *pmdp = pmd;
- *ptlp = ptl;
- return true;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
struct page_referenced_arg {
int mapcount;
int referenced;
@@ -887,45 +727,48 @@ struct page_referenced_arg {
static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
- struct mm_struct *mm = vma->vm_mm;
struct page_referenced_arg *pra = arg;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
int referenced = 0;
- if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
- return SWAP_AGAIN;
+ while (page_vma_mapped_walk(&pvmw)) {
+ address = pvmw.address;
- if (vma->vm_flags & VM_LOCKED) {
- if (pte)
- pte_unmap(pte);
- spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ if (vma->vm_flags & VM_LOCKED) {
+ page_vma_mapped_walk_done(&pvmw);
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
- if (pte) {
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pvmw.pte) {
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
+ /*
+ * Don't treat a reference through
+ * a sequentially read mapping as such.
+ * If the page has been used in another mapping,
+ * we will catch it; if this other mapping is
+ * already gone, the unmap path will have set
+ * PG_referenced or activated the page.
+ */
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ referenced++;
+ }
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pmdp_clear_flush_young_notify(vma, address,
+ pvmw.pmd))
referenced++;
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
}
- pte_unmap(pte);
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- if (pmdp_clear_flush_young_notify(vma, address, pmd))
- referenced++;
- } else {
- /* unexpected pmd-mapped page? */
- WARN_ON_ONCE(1);
+
+ pra->mapcount--;
}
- spin_unlock(ptl);
if (referenced)
clear_page_idle(page);
@@ -937,7 +780,6 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pra->vm_flags |= vma->vm_flags;
}
- pra->mapcount--;
if (!pra->mapcount)
return SWAP_SUCCESS; /* To break the loop */
@@ -1016,34 +858,56 @@ int page_referenced(struct page *page,
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
- struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
- spinlock_t *ptl;
- int ret = 0;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
int *cleaned = arg;
- pte = page_check_address(page, mm, address, &ptl, 1);
- if (!pte)
- goto out;
-
- if (pte_dirty(*pte) || pte_write(*pte)) {
- pte_t entry;
+ while (page_vma_mapped_walk(&pvmw)) {
+ int ret = 0;
+ address = pvmw.address;
+ if (pvmw.pte) {
+ pte_t entry;
+ pte_t *pte = pvmw.pte;
+
+ if (!pte_dirty(*pte) && !pte_write(*pte))
+ continue;
+
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, address, pte);
+ entry = pte_wrprotect(entry);
+ entry = pte_mkclean(entry);
+ set_pte_at(vma->vm_mm, address, pte, entry);
+ ret = 1;
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+ pmd_t *pmd = pvmw.pmd;
+ pmd_t entry;
+
+ if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
+ continue;
+
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = pmdp_huge_clear_flush(vma, address, pmd);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkclean(entry);
+ set_pmd_at(vma->vm_mm, address, pmd, entry);
+ ret = 1;
+#else
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
+#endif
+ }
- flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush(vma, address, pte);
- entry = pte_wrprotect(entry);
- entry = pte_mkclean(entry);
- set_pte_at(mm, address, pte, entry);
- ret = 1;
+ if (ret) {
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
+ (*cleaned)++;
+ }
}
- pte_unmap_unlock(pte, ptl);
-
- if (ret) {
- mmu_notifier_invalidate_page(mm, address);
- (*cleaned)++;
- }
-out:
return SWAP_AGAIN;
}
@@ -1295,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound)
goto out;
}
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
@@ -1335,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1436,155 +1300,164 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
pte_t pteval;
- spinlock_t *ptl;
+ struct page *subpage;
int ret = SWAP_AGAIN;
struct rmap_private *rp = arg;
enum ttu_flags flags = rp->flags;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- goto out;
+ return SWAP_AGAIN;
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
flags & TTU_MIGRATION, page);
- /* check if we have anything to do after split */
- if (page_mapcount(page) == 0)
- goto out;
}
- pte = page_check_address(page, mm, address, &ptl,
- PageTransCompound(page));
- if (!pte)
- goto out;
-
- /*
- * If the page is mlock()d, we cannot swap it out.
- * If it's recently referenced (perhaps page_referenced
- * skipped over this mm) then we should reactivate it.
- */
- if (!(flags & TTU_IGNORE_MLOCK)) {
- if (vma->vm_flags & VM_LOCKED) {
- /* PTE-mapped THP are never mlocked */
- if (!PageTransCompound(page)) {
- /*
- * Holding pte lock, we do *not* need
- * mmap_sem here
- */
- mlock_vma_page(page);
+ while (page_vma_mapped_walk(&pvmw)) {
+ /*
+ * If the page is mlock()d, we cannot swap it out.
+ * If it's recently referenced (perhaps page_referenced
+ * skipped over this mm) then we should reactivate it.
+ */
+ if (!(flags & TTU_IGNORE_MLOCK)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ /* PTE-mapped THP are never mlocked */
+ if (!PageTransCompound(page)) {
+ /*
+ * Holding pte lock, we do *not* need
+ * mmap_sem here
+ */
+ mlock_vma_page(page);
+ }
+ ret = SWAP_MLOCK;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
- ret = SWAP_MLOCK;
- goto out_unmap;
+ if (flags & TTU_MUNLOCK)
+ continue;
}
- if (flags & TTU_MUNLOCK)
- goto out_unmap;
- }
- if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- ret = SWAP_FAIL;
- goto out_unmap;
- }
- }
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, page_to_pfn(page));
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially a remote
- * CPU could still be writing to the page. If the entry was
- * previously clean then the architecture must guarantee that
- * a clear->dirty transition on a cached TLB entry is written
- * through and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pte);
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
- set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
- } else {
- pteval = ptep_clear_flush(vma, address, pte);
- }
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
+ if (!(flags & TTU_IGNORE_ACCESS)) {
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
+ ret = SWAP_FAIL;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ if (should_defer_flush(mm, flags)) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the page.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (PageHuge(page)) {
- hugetlb_count_sub(1 << compound_order(page), mm);
+ set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
} else {
- dec_mm_counter(mm, mm_counter(page));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
- set_pte_at(mm, address, pte,
- swp_entry_to_pte(make_hwpoison_entry(page)));
- } else if (pte_unused(pteval)) {
- /*
- * The guest indicated that the page content is of no
- * interest anymore. Simply discard the pte, vmscan
- * will take care of the rest.
- */
- dec_mm_counter(mm, mm_counter(page));
- } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
- swp_entry_t entry;
- pte_t swp_pte;
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(page, pte_write(pteval));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(mm, address, pte, swp_pte);
- } else if (PageAnon(page)) {
- swp_entry_t entry = { .val = page_private(page) };
- pte_t swp_pte;
- /*
- * Store the swap location in the pte.
- * See handle_pte_fault() ...
- */
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (!PageDirty(page) && (flags & TTU_LZFREE)) {
- /* It's a freeable page by MADV_FREE */
- dec_mm_counter(mm, MM_ANONPAGES);
- rp->lazyfreed++;
- goto discard;
- }
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
- if (swap_duplicate(entry) < 0) {
- set_pte_at(mm, address, pte, pteval);
- ret = SWAP_FAIL;
- goto out_unmap;
- }
- if (list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- if (list_empty(&mm->mmlist))
- list_add(&mm->mmlist, &init_mm.mmlist);
- spin_unlock(&mmlist_lock);
- }
- dec_mm_counter(mm, MM_ANONPAGES);
- inc_mm_counter(mm, MM_SWAPENTS);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(mm, address, pte, swp_pte);
- } else
- dec_mm_counter(mm, mm_counter_file(page));
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
-discard:
- page_remove_rmap(page, PageHuge(page));
- put_page(page);
+ if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHuge(page)) {
+ int nr = 1 << compound_order(page);
+ hugetlb_count_sub(nr, mm);
+ } else {
+ dec_mm_counter(mm, mm_counter(page));
+ }
+
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ } else if (pte_unused(pteval)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ */
+ dec_mm_counter(mm, mm_counter(page));
+ } else if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(subpage,
+ pte_write(pteval));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ } else if (PageAnon(page)) {
+ swp_entry_t entry = { .val = page_private(subpage) };
+ pte_t swp_pte;
+ /*
+ * Store the swap location in the pte.
+ * See handle_pte_fault() ...
+ */
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
-out_unmap:
- pte_unmap_unlock(pte, ptl);
- if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
+ if (!PageDirty(page) && (flags & TTU_LZFREE)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ rp->lazyfreed++;
+ goto discard;
+ }
+
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = SWAP_FAIL;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ } else
+ dec_mm_counter(mm, mm_counter_file(page));
+discard:
+ page_remove_rmap(subpage, PageHuge(page));
+ put_page(page);
mmu_notifier_invalidate_page(mm, address);
-out:
+ }
return ret;
}
@@ -1609,7 +1482,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
static int page_mapcount_is_zero(struct page *page)
{
- return !page_mapcount(page);
+ return !total_mapcount(page);
}
/**
@@ -1756,7 +1629,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff;
+ pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1770,8 +1643,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (!anon_vma)
return ret;
- pgoff = page_to_pgoff(page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ pgoff_start = page_to_pgoff(page);
+ pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+ pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1809,7 +1684,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct address_space *mapping = page_mapping(page);
- pgoff_t pgoff;
+ pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1824,10 +1699,12 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
if (!mapping)
return ret;
- pgoff = page_to_pgoff(page);
+ pgoff_start = page_to_pgoff(page);
+ pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
if (!locked)
i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap,
+ pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
cond_resched();
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
new file mode 100644
index 000000000000..0fd21670b513
--- /dev/null
+++ b/mm/rodata_test.c
@@ -0,0 +1,56 @@
+/*
+ * rodata_test.c: functional test for mark_rodata_ro function
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+void rodata_test(void)
+{
+ unsigned long start, end;
+ int zero = 0;
+
+ /* test 1: read the value */
+ /* If this test fails, some previous testrun has clobbered the state */
+ if (!rodata_test_data) {
+ pr_err("rodata_test: test 1 fails (start data)\n");
+ return;
+ }
+
+ /* test 2: write to the variable; this should fault */
+ if (!probe_kernel_write((void *)&rodata_test_data,
+ (void *)&zero, sizeof(zero))) {
+ pr_err("rodata_test: test data was not read only\n");
+ return;
+ }
+
+ /* test 3: check the value hasn't changed */
+ if (rodata_test_data == zero) {
+ pr_err("rodata_test: test data was changed\n");
+ return;
+ }
+
+ /* test 4: check if the rodata section is PAGE_SIZE aligned */
+ start = (unsigned long)__start_rodata;
+ end = (unsigned long)__end_rodata;
+ if (start & (PAGE_SIZE - 1)) {
+ pr_err("rodata_test: start of .rodata is not page size aligned\n");
+ return;
+ }
+ if (end & (PAGE_SIZE - 1)) {
+ pr_err("rodata_test: end of .rodata is not page size aligned\n");
+ return;
+ }
+
+ pr_info("rodata_test: all tests were successful\n");
+}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d32e1cb9f38..e67d6ba4e98e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,11 +29,14 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
+#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
+#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+
static struct vfsmount *shm_mnt;
#ifdef CONFIG_SHMEM
@@ -70,8 +73,10 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/rmap.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -115,13 +120,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
+ gfp_t gfp, struct vm_area_struct *vma,
+ struct vm_fault *vmf, int *fault_type);
int shmem_getpage(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp)
{
return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL);
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -190,6 +196,11 @@ static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;
+bool vma_is_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &shmem_vm_ops;
+}
+
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -300,18 +311,19 @@ void shmem_uncharge(struct inode *inode, long pages)
static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
+ struct radix_tree_node *node;
void **pslot;
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (!pslot)
+ item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+ if (!item)
return -ENOENT;
- item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- radix_tree_replace_slot(pslot, replacement);
+ __radix_tree_replace(&mapping->page_tree, node, pslot,
+ replacement, NULL, NULL);
return 0;
}
@@ -370,6 +382,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
int shmem_huge __read_mostly;
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -407,11 +420,13 @@ static const char *shmem_format_huge(int huge)
return "bad_val";
}
}
+#endif
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
LIST_HEAD(list), *pos, *next;
+ LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
struct page *page;
@@ -438,9 +453,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* Check if there's anything to gain */
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
- list_del_init(&info->shrinklist);
+ list_move(&info->shrinklist, &to_remove);
removed++;
- iput(inode);
goto next;
}
@@ -451,6 +465,13 @@ next:
}
spin_unlock(&sbinfo->shrinklist_lock);
+ list_for_each_safe(pos, next, &to_remove) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+ list_del_init(&info->shrinklist);
+ iput(inode);
+ }
+
list_for_each_safe(pos, next, &list) {
int ret;
@@ -658,8 +679,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
swapped++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
@@ -938,10 +959,10 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
+static int shmem_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
@@ -1046,6 +1067,30 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned long found = -1;
+ unsigned int checked = 0;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, root, &iter, 0) {
+ if (*slot == item) {
+ found = iter.index;
+ break;
+ }
+ checked++;
+ if ((checked % 4096) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
@@ -1059,7 +1104,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->page_tree, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -1536,10 +1581,10 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct mm_struct *fault_mm, int *fault_type)
+ struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info;
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
@@ -1589,9 +1634,8 @@ repeat:
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = fault_mm ? : current->mm;
+ charge_mm = vma ? vma->vm_mm : current->mm;
if (swap.val) {
/* Look it up and read it in.. */
@@ -1601,7 +1645,8 @@ repeat:
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+ mem_cgroup_count_vm_event(charge_mm,
+ PGMAJFAULT);
}
/* Here we actually start the io */
page = shmem_swapin(swap, gfp, info, index);
@@ -1670,6 +1715,11 @@ repeat:
swap_free(swap);
} else {
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
+
/* shmem_symlink() */
if (mapping->a_ops != &shmem_aops)
goto alloc_nohuge;
@@ -1837,7 +1887,6 @@ unlock:
put_page(page);
}
if (error == -ENOSPC && !once++) {
- info = SHMEM_I(inode);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1860,8 +1909,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync
return ret;
}
-static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int shmem_fault(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
enum sgp_type sgp;
@@ -1933,7 +1983,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
sgp = SGP_NOHUGE;
error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
- gfp, vma->vm_mm, &ret);
+ gfp, vma, vmf, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
return ret;
@@ -2143,10 +2193,123 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- if (!mapping->host)
- return false;
+ return mapping->a_ops == &shmem_aops;
+}
- return mapping->host->i_sb->s_op == &shmem_ops;
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct mem_cgroup *memcg;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ struct page *page;
+ pte_t _dst_pte, *dst_pte;
+ int ret;
+
+ ret = -ENOMEM;
+ if (shmem_acct_block(info->flags, 1))
+ goto out;
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0)
+ goto out_unacct_blocks;
+ percpu_counter_inc(&sbinfo->used_blocks);
+ }
+
+ if (!*pagep) {
+ page = shmem_alloc_page(gfp, info, pgoff);
+ if (!page)
+ goto out_dec_used_blocks;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ *pagep = page;
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+ shmem_unacct_blocks(info->flags, 1);
+ /* don't free the page */
+ return -EFAULT;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ __SetPageUptodate(page);
+
+ ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+ if (ret)
+ goto out_release;
+
+ ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+ if (!ret) {
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
+ radix_tree_preload_end();
+ }
+ if (ret)
+ goto out_release_uncharge;
+
+ mem_cgroup_commit_charge(page, memcg, false, false);
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ lru_cache_add_anon(page);
+
+ spin_lock(&info->lock);
+ info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ inc_mm_counter(dst_mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ unlock_page(page);
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out_release_uncharge:
+ mem_cgroup_cancel_charge(page, memcg, false);
+out_release:
+ unlock_page(page);
+ put_page(page);
+out_dec_used_blocks:
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+out_unacct_blocks:
+ shmem_unacct_blocks(info->flags, 1);
+ goto out;
}
#ifdef CONFIG_TMPFS
@@ -2169,7 +2332,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
- if (unlikely(info->seals)) {
+ if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
if (info->seals & F_SEAL_WRITE)
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
@@ -2446,8 +2609,8 @@ static void shmem_tag_pins(struct address_space *mapping)
}
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2516,8 +2679,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
spin_unlock_irq(&mapping->tree_lock);
continue_resched:
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -3187,7 +3350,6 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3195,7 +3357,6 @@ static const struct inode_operations shmem_short_symlink_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -4110,7 +4271,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
BUG_ON(mapping->a_ops != &shmem_aops);
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
- gfp, NULL, NULL);
+ gfp, NULL, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
diff --git a/mm/slab.c b/mm/slab.c
index 0b0550ca85b4..807d86c76908 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -116,6 +116,7 @@
#include <linux/kmemcheck.h>
#include <linux/memory.h>
#include <linux/prefetch.h>
+#include <linux/sched/task_stack.h>
#include <net/sock.h>
@@ -227,13 +228,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
+ parent->total_slabs = 0;
+ parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
- parent->num_slabs = 0;
}
#define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -551,12 +553,7 @@ static void start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
- /*
- * When this gets called from do_initcalls via cpucache_init(),
- * init_workqueues() has already run, so keventd will be setup
- * at that time.
- */
- if (keventd_up() && reap_work->work.func == NULL) {
+ if (reap_work->work.func == NULL) {
init_reap_node(cpu);
INIT_DEFERRABLE_WORK(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
@@ -1292,7 +1289,8 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
+ kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
@@ -1366,7 +1364,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
#if DEBUG
struct kmem_cache_node *n;
- struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1378,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
- unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
- unsigned long active_slabs = 0, num_slabs = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full;
+ unsigned long total_slabs, free_slabs, free_objs;
spin_lock_irqsave(&n->list_lock, flags);
- num_slabs = n->num_slabs;
- list_for_each_entry(page, &n->slabs_partial, lru) {
- active_objs += page->active;
- num_slabs_partial++;
- }
- list_for_each_entry(page, &n->slabs_free, lru)
- num_slabs_free++;
-
- free_objects += n->free_objects;
+ total_slabs = n->total_slabs;
+ free_slabs = n->free_slabs;
+ free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs -
- (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);
-
- pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
- node, active_slabs, num_slabs, active_objs, num_objs,
- free_objects);
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+ node, total_slabs - free_slabs, total_slabs,
+ (total_slabs * cachep->num) - free_objs,
+ total_slabs * cachep->num);
}
#endif
}
@@ -2318,7 +2301,8 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
- n->num_slabs--;
+ n->free_slabs--;
+ n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2332,7 +2316,7 @@ out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0;
int node;
@@ -2350,9 +2334,16 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
return (ret ? 1 : 0);
}
+#ifdef CONFIG_MEMCG
+void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
+{
+ __kmem_cache_shrink(cachep);
+}
+#endif
+
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- return __kmem_cache_shrink(cachep, false);
+ return __kmem_cache_shrink(cachep);
}
void __kmem_cache_release(struct kmem_cache *cachep)
@@ -2475,7 +2466,6 @@ union freelist_init_state {
unsigned int pos;
unsigned int *list;
unsigned int count;
- unsigned int rand;
};
struct rnd_state rnd_state;
};
@@ -2501,8 +2491,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
} else {
state->list = cachep->random_seq;
state->count = count;
- state->pos = 0;
- state->rand = rand;
+ state->pos = rand % count;
ret = true;
}
return ret;
@@ -2511,7 +2500,9 @@ static bool freelist_state_initialize(union freelist_init_state *state,
/* Get the next entry on the list and randomize it using a random shift */
static freelist_idx_t next_random_slot(union freelist_init_state *state)
{
- return (state->list[state->pos++] + state->rand) % state->count;
+ if (state->pos >= state->count)
+ state->pos = 0;
+ return state->list[state->pos++];
}
/* Swap two freelist entries */
@@ -2753,12 +2744,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
- if (!page->active)
+ n->total_slabs++;
+ if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
- else
+ n->free_slabs++;
+ } else
fixup_slab_list(cachep, n, page, &list);
- n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@@ -2903,9 +2895,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
/* Move pfmemalloc slab to the end of list to speed up next search */
list_del(&page->lru);
- if (!page->active)
+ if (!page->active) {
list_add_tail(&page->lru, &n->slabs_free);
- else
+ n->free_slabs++;
+ } else
list_add_tail(&page->lru, &n->slabs_partial);
list_for_each_entry(page, &n->slabs_partial, lru) {
@@ -2913,9 +2906,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
return page;
}
+ n->free_touched = 1;
list_for_each_entry(page, &n->slabs_free, lru) {
- if (!PageSlabPfmemalloc(page))
+ if (!PageSlabPfmemalloc(page)) {
+ n->free_slabs--;
return page;
+ }
}
return NULL;
@@ -2925,16 +2921,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
- page = list_first_entry_or_null(&n->slabs_partial,
- struct page, lru);
+ assert_spin_locked(&n->list_lock);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
- page = list_first_entry_or_null(&n->slabs_free,
- struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_free, struct page,
+ lru);
+ if (page)
+ n->free_slabs--;
}
if (sk_memalloc_socks())
- return get_valid_first_slab(n, page, pfmemalloc);
+ page = get_valid_first_slab(n, page, pfmemalloc);
return page;
}
@@ -3434,9 +3432,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
- if (page->active == 0)
+ if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
- else {
+ n->free_slabs++;
+ } else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@@ -3450,7 +3449,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
- n->num_slabs--;
+ n->free_slabs--;
+ n->total_slabs--;
}
}
@@ -4102,64 +4102,33 @@ out:
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- struct page *page;
- unsigned long active_objs;
- unsigned long num_objs;
- unsigned long active_slabs = 0;
- unsigned long num_slabs, free_objects = 0, shared_avail = 0;
- unsigned long num_slabs_partial = 0, num_slabs_free = 0;
- unsigned long num_slabs_full = 0;
- const char *name;
- char *error = NULL;
+ unsigned long active_objs, num_objs, active_slabs;
+ unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+ unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;
- active_objs = 0;
- num_slabs = 0;
for_each_kmem_cache_node(cachep, node, n) {
-
check_irq_on();
spin_lock_irq(&n->list_lock);
- num_slabs += n->num_slabs;
+ total_slabs += n->total_slabs;
+ free_slabs += n->free_slabs;
+ free_objs += n->free_objects;
- list_for_each_entry(page, &n->slabs_partial, lru) {
- if (page->active == cachep->num && !error)
- error = "slabs_partial accounting error";
- if (!page->active && !error)
- error = "slabs_partial accounting error";
- active_objs += page->active;
- num_slabs_partial++;
- }
-
- list_for_each_entry(page, &n->slabs_free, lru) {
- if (page->active && !error)
- error = "slabs_free accounting error";
- num_slabs_free++;
- }
-
- free_objects += n->free_objects;
if (n->shared)
shared_avail += n->shared->avail;
spin_unlock_irq(&n->list_lock);
}
- num_objs = num_slabs * cachep->num;
- active_slabs = num_slabs - num_slabs_free;
- num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
- active_objs += (num_slabs_full * cachep->num);
-
- if (num_objs - active_objs != free_objects && !error)
- error = "free_objects accounting error";
-
- name = cachep->name;
- if (error)
- pr_err("slab: cache %s error: %s\n", name, error);
+ num_objs = total_slabs * cachep->num;
+ active_slabs = total_slabs - free_slabs;
+ active_objs = num_objs - free_objs;
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
- sinfo->num_slabs = num_slabs;
+ sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;
diff --git a/mm/slab.h b/mm/slab.h
index bc05fdc3edce..65e7c3fcac72 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,12 @@ extern struct list_head slab_caches;
/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;
+/* A table of kmalloc cache names and sizes */
+extern const struct kmalloc_info_struct {
+ const char *name;
+ unsigned long size;
+} kmalloc_info[];
+
unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size);
@@ -142,11 +148,27 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_CACHE_FLAGS (0)
#endif
+/* Common flags available with current configuration */
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+/* Common flags permitted for kmem_cache_create */
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
+ SLAB_RED_ZONE | \
+ SLAB_POISON | \
+ SLAB_STORE_USER | \
+ SLAB_TRACE | \
+ SLAB_CONSISTENCY_CHECKS | \
+ SLAB_MEM_SPREAD | \
+ SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | \
+ SLAB_NOTRACK | \
+ SLAB_ACCOUNT)
+
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *, bool);
+int __kmem_cache_shrink(struct kmem_cache *);
+void __kmemcg_cache_deactivate(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -180,17 +202,22 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+/* List of all root caches. */
+extern struct list_head slab_root_caches;
+#define root_caches_node memcg_params.__root_caches_node
+
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
* slab_mutex.
*/
#define for_each_memcg_cache(iter, root) \
- list_for_each_entry(iter, &(root)->memcg_params.list, \
- memcg_params.list)
+ list_for_each_entry(iter, &(root)->memcg_params.children, \
+ memcg_params.children_node)
static inline bool is_root_cache(struct kmem_cache *s)
{
- return s->memcg_params.is_root_cache;
+ return !s->memcg_params.root_cache;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -279,9 +306,16 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
}
extern void slab_init_memcg_params(struct kmem_cache *);
+extern void memcg_link_cache(struct kmem_cache *s);
+extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+ void (*deact_fn)(struct kmem_cache *));
#else /* CONFIG_MEMCG && !CONFIG_SLOB */
+/* If !memcg, all caches are root. */
+#define slab_root_caches slab_caches
+#define root_caches_node list
+
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
@@ -326,6 +360,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
+
+static inline void memcg_link_cache(struct kmem_cache *s)
+{
+}
+
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
@@ -432,7 +471,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
- unsigned long num_slabs;
+ unsigned long total_slabs; /* length of all slab lists */
+ unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
@@ -472,6 +512,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
+void *memcg_slab_start(struct seq_file *m, loff_t *pos);
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
+void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 329b03843863..09d0e849b07f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,11 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
+static LIST_HEAD(slab_caches_to_rcu_destroy);
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
+ slab_caches_to_rcu_destroy_workfn);
+
/*
* Set of flags that will prevent slab merging
*/
@@ -133,11 +138,14 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+
+LIST_HEAD(slab_root_caches);
+
void slab_init_memcg_params(struct kmem_cache *s)
{
- s->memcg_params.is_root_cache = true;
- INIT_LIST_HEAD(&s->memcg_params.list);
+ s->memcg_params.root_cache = NULL;
RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+ INIT_LIST_HEAD(&s->memcg_params.children);
}
static int init_memcg_params(struct kmem_cache *s,
@@ -145,10 +153,11 @@ static int init_memcg_params(struct kmem_cache *s,
{
struct memcg_cache_array *arr;
- if (memcg) {
- s->memcg_params.is_root_cache = false;
- s->memcg_params.memcg = memcg;
+ if (root_cache) {
s->memcg_params.root_cache = root_cache;
+ s->memcg_params.memcg = memcg;
+ INIT_LIST_HEAD(&s->memcg_params.children_node);
+ INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
return 0;
}
@@ -177,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
struct memcg_cache_array *old, *new;
- if (!is_root_cache(s))
- return 0;
-
new = kzalloc(sizeof(struct memcg_cache_array) +
new_array_size * sizeof(void *), GFP_KERNEL);
if (!new)
@@ -203,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs)
int ret = 0;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
+ list_for_each_entry(s, &slab_root_caches, root_caches_node) {
ret = update_memcg_params(s, num_memcgs);
/*
* Instead of freeing the memory, we'll just leave the caches
@@ -215,6 +221,28 @@ int memcg_update_all_caches(int num_memcgs)
mutex_unlock(&slab_mutex);
return ret;
}
+
+void memcg_link_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ list_add(&s->root_caches_node, &slab_root_caches);
+ } else {
+ list_add(&s->memcg_params.children_node,
+ &s->memcg_params.root_cache->memcg_params.children);
+ list_add(&s->memcg_params.kmem_caches_node,
+ &s->memcg_params.memcg->kmem_caches);
+ }
+}
+
+static void memcg_unlink_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ list_del(&s->root_caches_node);
+ } else {
+ list_del(&s->memcg_params.children_node);
+ list_del(&s->memcg_params.kmem_caches_node);
+ }
+}
#else
static inline int init_memcg_params(struct kmem_cache *s,
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
@@ -225,6 +253,10 @@ static inline int init_memcg_params(struct kmem_cache *s,
static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
+
+static inline void memcg_unlink_cache(struct kmem_cache *s)
+{
+}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
@@ -255,7 +287,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
{
struct kmem_cache *s;
- if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
+ if (slab_nomerge)
return NULL;
if (ctor)
@@ -266,7 +298,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
size = ALIGN(size, align);
flags = kmem_cache_flags(size, flags, name, NULL);
- list_for_each_entry_reverse(s, &slab_caches, list) {
+ if (flags & SLAB_NEVER_MERGE)
+ return NULL;
+
+ list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
if (slab_unmergeable(s))
continue;
@@ -350,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name,
s->refcount = 1;
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
out:
if (err)
return ERR_PTR(err);
@@ -404,6 +440,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
goto out_unlock;
}
+ /* Refuse requests with allocator specific flags */
+ if (flags & ~SLAB_FLAGS_PERMITTED) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -452,33 +494,61 @@ out_unlock:
}
EXPORT_SYMBOL(kmem_cache_create);
-static int shutdown_cache(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
- if (__kmem_cache_shutdown(s) != 0)
- return -EBUSY;
+ LIST_HEAD(to_destroy);
+ struct kmem_cache *s, *s2;
- if (s->flags & SLAB_DESTROY_BY_RCU)
- *need_rcu_barrier = true;
+ /*
+ * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the
+ * @slab_caches_to_rcu_destroy list. The slab pages are freed
+ * through RCU and and the associated kmem_cache are dereferenced
+ * while freeing the pages, so the kmem_caches should be freed only
+ * after the pending RCU operations are finished. As rcu_barrier()
+ * is a pretty slow operation, we batch all pending destructions
+ * asynchronously.
+ */
+ mutex_lock(&slab_mutex);
+ list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
+ mutex_unlock(&slab_mutex);
- list_move(&s->list, release);
- return 0;
+ if (list_empty(&to_destroy))
+ return;
+
+ rcu_barrier();
+
+ list_for_each_entry_safe(s, s2, &to_destroy, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_release(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
}
-static void release_caches(struct list_head *release, bool need_rcu_barrier)
+static int shutdown_cache(struct kmem_cache *s)
{
- struct kmem_cache *s, *s2;
+ /* free asan quarantined objects */
+ kasan_cache_shutdown(s);
+
+ if (__kmem_cache_shutdown(s) != 0)
+ return -EBUSY;
- if (need_rcu_barrier)
- rcu_barrier();
+ memcg_unlink_cache(s);
+ list_del(&s->list);
- list_for_each_entry_safe(s, s2, release, list) {
+ if (s->flags & SLAB_DESTROY_BY_RCU) {
+ list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
+ schedule_work(&slab_caches_to_rcu_destroy_work);
+ } else {
#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_remove(s);
+ sysfs_slab_release(s);
#else
slab_kmem_cache_release(s);
#endif
}
+
+ return 0;
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
@@ -545,8 +615,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
goto out_unlock;
}
- list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
-
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
@@ -562,6 +630,66 @@ out_unlock:
put_online_cpus();
}
+static void kmemcg_deactivate_workfn(struct work_struct *work)
+{
+ struct kmem_cache *s = container_of(work, struct kmem_cache,
+ memcg_params.deact_work);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ s->memcg_params.deact_fn(s);
+
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
+ css_put(&s->memcg_params.memcg->css);
+}
+
+static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+{
+ struct kmem_cache *s = container_of(head, struct kmem_cache,
+ memcg_params.deact_rcu_head);
+
+ /*
+ * We need to grab blocking locks. Bounce to ->deact_work. The
+ * work item shares the space with the RCU head and can't be
+ * initialized eariler.
+ */
+ INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
+ queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
+}
+
+/**
+ * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
+ * sched RCU grace period
+ * @s: target kmem_cache
+ * @deact_fn: deactivation function to call
+ *
+ * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
+ * held after a sched RCU grace period. The slab is guaranteed to stay
+ * alive until @deact_fn is finished. This is to be used from
+ * __kmemcg_cache_deactivate().
+ */
+void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
+ void (*deact_fn)(struct kmem_cache *))
+{
+ if (WARN_ON_ONCE(is_root_cache(s)) ||
+ WARN_ON_ONCE(s->memcg_params.deact_fn))
+ return;
+
+ /* pin memcg so that @s doesn't get destroyed in the middle */
+ css_get(&s->memcg_params.memcg->css);
+
+ s->memcg_params.deact_fn = deact_fn;
+ call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+}
+
void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
int idx;
@@ -574,17 +702,14 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
get_online_mems();
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- if (!is_root_cache(s))
- continue;
-
+ list_for_each_entry(s, &slab_root_caches, root_caches_node) {
arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
lockdep_is_held(&slab_mutex));
c = arr->entries[idx];
if (!c)
continue;
- __kmem_cache_shrink(c, true);
+ __kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
@@ -593,47 +718,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
put_online_cpus();
}
-static int __shutdown_memcg_cache(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
-{
- BUG_ON(is_root_cache(s));
-
- if (shutdown_cache(s, release, need_rcu_barrier))
- return -EBUSY;
-
- list_del(&s->memcg_params.list);
- return 0;
-}
-
void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
- LIST_HEAD(release);
- bool need_rcu_barrier = false;
struct kmem_cache *s, *s2;
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
- list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (is_root_cache(s) || s->memcg_params.memcg != memcg)
- continue;
+ list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
+ memcg_params.kmem_caches_node) {
/*
* The cgroup is about to be freed and therefore has no charges
* left. Hence, all its caches must be empty by now.
*/
- BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
+ BUG_ON(shutdown_cache(s));
}
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
-
- release_caches(&release, need_rcu_barrier);
}
-static int shutdown_memcg_caches(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static int shutdown_memcg_caches(struct kmem_cache *s)
{
struct memcg_cache_array *arr;
struct kmem_cache *c, *c2;
@@ -652,13 +759,13 @@ static int shutdown_memcg_caches(struct kmem_cache *s,
c = arr->entries[i];
if (!c)
continue;
- if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
+ if (shutdown_cache(c))
/*
* The cache still has objects. Move it to a temporary
* list so as not to try to destroy it for a second
* time while iterating over inactive caches below.
*/
- list_move(&c->memcg_params.list, &busy);
+ list_move(&c->memcg_params.children_node, &busy);
else
/*
* The cache is empty and will be destroyed soon. Clear
@@ -673,23 +780,22 @@ static int shutdown_memcg_caches(struct kmem_cache *s,
* Second, shutdown all caches left from memory cgroups that are now
* offline.
*/
- list_for_each_entry_safe(c, c2, &s->memcg_params.list,
- memcg_params.list)
- __shutdown_memcg_cache(c, release, need_rcu_barrier);
+ list_for_each_entry_safe(c, c2, &s->memcg_params.children,
+ memcg_params.children_node)
+ shutdown_cache(c);
- list_splice(&busy, &s->memcg_params.list);
+ list_splice(&busy, &s->memcg_params.children);
/*
* A cache being destroyed must be empty. In particular, this means
* that all per memcg caches attached to it must be empty too.
*/
- if (!list_empty(&s->memcg_params.list))
+ if (!list_empty(&s->memcg_params.children))
return -EBUSY;
return 0;
}
#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s,
- struct list_head *release, bool *need_rcu_barrier)
+static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
@@ -705,8 +811,6 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- LIST_HEAD(release);
- bool need_rcu_barrier = false;
int err;
if (unlikely(!s))
@@ -715,16 +819,15 @@ void kmem_cache_destroy(struct kmem_cache *s)
get_online_cpus();
get_online_mems();
- kasan_cache_destroy(s);
mutex_lock(&slab_mutex);
s->refcount--;
if (s->refcount)
goto out_unlock;
- err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
+ err = shutdown_memcg_caches(s);
if (!err)
- err = shutdown_cache(s, &release, &need_rcu_barrier);
+ err = shutdown_cache(s);
if (err) {
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
@@ -736,8 +839,6 @@ out_unlock:
put_online_mems();
put_online_cpus();
-
- release_caches(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -755,7 +856,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep, false);
+ ret = __kmem_cache_shrink(cachep);
put_online_mems();
put_online_cpus();
return ret;
@@ -799,6 +900,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
create_boot_cache(s, name, size, flags);
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
s->refcount = 1;
return s;
}
@@ -883,10 +985,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
* kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
* kmalloc-67108864.
*/
-static struct {
- const char *name;
- unsigned long size;
-} const kmalloc_info[] __initconst = {
+const struct kmalloc_info_struct kmalloc_info[] __initconst = {
{NULL, 0}, {"kmalloc-96", 96},
{"kmalloc-192", 192}, {"kmalloc-8", 8},
{"kmalloc-16", 16}, {"kmalloc-32", 32},
@@ -1109,12 +1208,12 @@ static void print_slabinfo_header(struct seq_file *m)
void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
- return seq_list_start(&slab_caches, *pos);
+ return seq_list_start(&slab_root_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &slab_caches, pos);
+ return seq_list_next(p, &slab_root_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
@@ -1166,25 +1265,44 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
static int slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
- if (p == slab_caches.next)
+ if (p == slab_root_caches.next)
print_slabinfo_header(m);
- if (is_root_cache(s))
- cache_show(s, m);
+ cache_show(s, m);
return 0;
}
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+void *memcg_slab_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ mutex_lock(&slab_mutex);
+ return seq_list_start(&memcg->kmem_caches, *pos);
+}
+
+void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ return seq_list_next(p, &memcg->kmem_caches, pos);
+}
+
+void memcg_slab_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
int memcg_slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache,
+ memcg_params.kmem_caches_node);
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- if (p == slab_caches.next)
+ if (p == memcg->kmem_caches.next)
print_slabinfo_header(m);
- if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
- cache_show(s, m);
+ cache_show(s, m);
return 0;
}
#endif
diff --git a/mm/slob.c b/mm/slob.c
index 5ec158054ffe..eac04d4357ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
{
}
-int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index 2b3e740609e9..7f4bc7027ed5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
+static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -496,10 +498,11 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
}
-static void print_section(char *text, u8 *addr, unsigned int length)
+static void print_section(char *level, char *text, u8 *addr,
+ unsigned int length)
{
metadata_access_enable();
- print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
metadata_access_disable();
}
@@ -636,14 +639,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ s->red_left_pad);
else if (p > addr + 16)
- print_section("Bytes b4 ", p - 16, 16);
+ print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section("Object ", p, min_t(unsigned long, s->object_size,
- PAGE_SIZE));
+ print_section(KERN_ERR, "Object ", p,
+ min_t(unsigned long, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
if (s->offset)
@@ -658,7 +662,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, size_from_object(s) - off);
+ print_section(KERN_ERR, "Padding ", p + off,
+ size_from_object(s) - off);
dump_stack();
}
@@ -820,7 +825,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding ", end - remainder, remainder);
+ print_section(KERN_ERR, "Padding ", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
@@ -973,7 +978,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object ", (void *)object,
+ print_section(KERN_INFO, "Object ", (void *)object,
s->object_size);
dump_stack();
@@ -1419,6 +1424,10 @@ static int init_cache_random_seq(struct kmem_cache *s)
int err;
unsigned long i, count = oo_objects(s->oo);
+ /* Bailout if already initialised */
+ if (s->random_seq)
+ return 0;
+
err = cache_random_seq_create(s, count, GFP_KERNEL);
if (err) {
pr_err("SLUB: Unable to initialize free list for %s\n",
@@ -1623,6 +1632,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= ~GFP_SLAB_BUG_MASK;
pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
}
return allocate_slab(s,
@@ -3076,7 +3086,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
struct detached_freelist df;
size = build_detached_freelist(s, size, p, &df);
- if (unlikely(!df.page))
+ if (!df.page)
continue;
slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
@@ -3679,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
+ sysfs_slab_remove(s);
return 0;
}
@@ -3883,7 +3894,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3895,21 +3906,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
unsigned long flags;
int ret = 0;
- if (deactivate) {
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- s->cpu_partial = 0;
- s->min_partial = 0;
-
- /*
- * s->cpu_partial is checked locklessly (see put_cpu_partial),
- * so we have to make sure the change is visible.
- */
- synchronize_sched();
- }
-
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
INIT_LIST_HEAD(&discard);
@@ -3960,13 +3956,49 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
return ret;
}
+#ifdef CONFIG_MEMCG
+static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
+{
+ /*
+ * Called with all the locks held after a sched RCU grace period.
+ * Even if @s becomes empty after shrinking, we can't know that @s
+ * doesn't have allocations already in-flight and thus can't
+ * destroy @s until the associated memcg is released.
+ *
+ * However, let's remove the sysfs files for empty caches here.
+ * Each cache has a lot of interface files which aren't
+ * particularly useful for empty draining caches; otherwise, we can
+ * easily end up with millions of unnecessary sysfs files on
+ * systems which have a lot of memory and transient cgroups.
+ */
+ if (!__kmem_cache_shrink(s))
+ sysfs_slab_remove(s);
+}
+
+void __kmemcg_cache_deactivate(struct kmem_cache *s)
+{
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial), so
+ * we have to make sure the change is visible before shrinking.
+ */
+ slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
+}
+#endif
+
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s, false);
+ __kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);
return 0;
@@ -4116,6 +4148,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
}
slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
+ memcg_link_cache(s);
return s;
}
@@ -4675,6 +4708,22 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
+#ifdef CONFIG_MEMCG
+static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
+
+static int __init setup_slub_memcg_sysfs(char *str)
+{
+ int v;
+
+ if (get_option(&str, &v) > 0)
+ memcg_sysfs_enabled = v;
+
+ return 1;
+}
+
+__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
+#endif
+
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
@@ -5578,8 +5627,14 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
+ struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ if (!kset) {
+ kobject_init(&s->kobj, &slab_ktype);
+ return 0;
+ }
+
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5596,7 +5651,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = cache_kset(s);
+ s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
if (err)
goto out;
@@ -5606,7 +5661,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
goto out_del_kobj;
#ifdef CONFIG_MEMCG
- if (is_root_cache(s)) {
+ if (is_root_cache(s) && memcg_sysfs_enabled) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
err = -ENOMEM;
@@ -5629,7 +5684,7 @@ out_del_kobj:
goto out;
}
-void sysfs_slab_remove(struct kmem_cache *s)
+static void sysfs_slab_remove(struct kmem_cache *s)
{
if (slab_state < FULL)
/*
@@ -5638,12 +5693,26 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
+ if (!s->kobj.state_in_sysfs)
+ /*
+ * For a memcg cache, this may be called during
+ * deactivation and again on shutdown. Remove only once.
+ * A cache is never shut down before deactivation is
+ * complete, so no need to worry about synchronization.
+ */
+ return;
+
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
- kobject_put(&s->kobj);
+}
+
+void sysfs_slab_release(struct kmem_cache *s)
+{
+ if (slab_state >= FULL)
+ kobject_put(&s->kobj);
}
/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 574c67b663fe..a56c3989f773 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -196,9 +196,9 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
+pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
- pud_t *pud = pud_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
void *p = vmemmap_alloc_block(PAGE_SIZE, node);
if (!p)
@@ -208,6 +208,18 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
return pud;
}
+p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+{
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return NULL;
+ p4d_populate(&init_mm, p4d, p);
+ }
+ return p4d;
+}
+
pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
@@ -225,6 +237,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
{
unsigned long addr = start;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -233,7 +246,10 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
return -ENOMEM;
- pud = vmemmap_pud_populate(pgd, addr, node);
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+ pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
pmd = vmemmap_pmd_populate(pud, addr, node);
diff --git a/mm/sparse.c b/mm/sparse.c
index 1e168bf2779a..db6bf3c97ea2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -662,12 +662,12 @@ static void free_map_bootmem(struct page *memmap)
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = (unsigned long) page->lru.next;
+ magic = (unsigned long) page->freelist;
BUG_ON(magic == NODE_INFO);
maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
- removing_section_nr = page->private;
+ removing_section_nr = page_private(page);
/*
* When this function is called, the removing section is
diff --git a/mm/swap.c b/mm/swap.c
index 4dcf852e1e6d..5dabf444d724 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
+ __ClearPageWaiters(page);
mem_cgroup_uncharge(page);
}
@@ -208,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
{
int *pgmoved = arg;
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- enum lru_list lru = page_lru_base_type(page);
- list_move_tail(&page->lru, &lruvec->lists[lru]);
+ if (PageLRU(page) && !PageUnevictable(page)) {
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ ClearPageActive(page);
+ add_page_to_lru_list_tail(page, lruvec, page_lru(page));
(*pgmoved)++;
}
}
@@ -234,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
*/
void rotate_reclaimable_page(struct page *page)
{
- if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
+ if (!PageLocked(page) && !PageDirty(page) &&
!PageUnevictable(page) && PageLRU(page)) {
struct pagevec *pvec;
unsigned long flags;
@@ -668,30 +670,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
- lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
- if (WARN(!lru_add_drain_wq,
- "Failed to create workqueue lru_add_drain_wq"))
- return -ENOMEM;
-
- return 0;
-}
-early_initcall(lru_init);
-
void lru_add_drain_all(void)
{
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
int cpu;
+ /*
+ * Make sure nobody triggers this path before mm_percpu_wq is fully
+ * initialized.
+ */
+ if (WARN_ON(!mm_percpu_wq))
+ return;
+
mutex_lock(&lock);
get_online_cpus();
cpumask_clear(&has_work);
@@ -705,7 +696,7 @@ void lru_add_drain_all(void)
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, lru_add_drain_wq, work);
+ queue_work_on(cpu, mm_percpu_wq, work);
cpumask_set_cpu(cpu, &has_work);
}
}
@@ -784,6 +775,7 @@ void release_pages(struct page **pages, int nr, bool cold)
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
+ __ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
}
@@ -969,12 +961,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
void __init swap_setup(void)
{
unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-#ifdef CONFIG_SWAP
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++)
- spin_lock_init(&swapper_spaces[i].tree_lock);
-#endif
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 310ac0b8f974..ac6318a064d3 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -201,6 +201,8 @@ void swap_cgroup_swapoff(int type)
struct page *page = map[i];
if (page)
__free_page(page);
+ if (!(i % SWAP_CLUSTER_MAX))
+ cond_resched();
}
vfree(map);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
new file mode 100644
index 000000000000..b1ccb58ad397
--- /dev/null
+++ b/mm/swap_slots.c
@@ -0,0 +1,340 @@
+/*
+ * Manage cache of swap slots to be used for and returned from
+ * swap.
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * We allocate the swap slots from the global pool and put
+ * it into local per cpu caches. This has the advantage
+ * of no needing to acquire the swap_info lock every time
+ * we need a new slot.
+ *
+ * There is also opportunity to simply return the slot
+ * to local caches without needing to acquire swap_info
+ * lock. We do not reuse the returned slots directly but
+ * move them back to the global pool in a batch. This
+ * allows the slots to coaellesce and reduce fragmentation.
+ *
+ * The swap entry allocated is marked with SWAP_HAS_CACHE
+ * flag in map_count that prevents it from being allocated
+ * again from the global pool.
+ *
+ * The swap slots cache is protected by a mutex instead of
+ * a spin lock as when we search for slots with scan_swap_map,
+ * we can possibly sleep.
+ */
+
+#include <linux/swap_slots.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_SWAP
+
+static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
+static bool swap_slot_cache_active;
+bool swap_slot_cache_enabled;
+static bool swap_slot_cache_initialized;
+DEFINE_MUTEX(swap_slots_cache_mutex);
+/* Serialize swap slots cache enable/disable operations */
+DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+
+static void __drain_swap_slots_cache(unsigned int type);
+static void deactivate_swap_slots_cache(void);
+static void reactivate_swap_slots_cache(void);
+
+#define use_swap_slot_cache (swap_slot_cache_active && \
+ swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define SLOTS_CACHE 0x1
+#define SLOTS_CACHE_RET 0x2
+
+static void deactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = false;
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+static void reactivate_swap_slots_cache(void)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ swap_slot_cache_active = true;
+ mutex_unlock(&swap_slots_cache_mutex);
+}
+
+/* Must not be called with cpu hot plug lock */
+void disable_swap_slots_cache_lock(void)
+{
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ swap_slot_cache_enabled = false;
+ if (swap_slot_cache_initialized) {
+ /* serialize with cpu hotplug operations */
+ get_online_cpus();
+ __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ put_online_cpus();
+ }
+}
+
+static void __reenable_swap_slots_cache(void)
+{
+ swap_slot_cache_enabled = has_usable_swap();
+}
+
+void reenable_swap_slots_cache_unlock(void)
+{
+ __reenable_swap_slots_cache();
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+}
+
+static bool check_cache_active(void)
+{
+ long pages;
+
+ if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+ return false;
+
+ pages = get_nr_swap_pages();
+ if (!swap_slot_cache_active) {
+ if (pages > num_online_cpus() *
+ THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
+ reactivate_swap_slots_cache();
+ goto out;
+ }
+
+ /* if global pool of slot caches too low, deactivate cache */
+ if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
+ deactivate_swap_slots_cache();
+out:
+ return swap_slot_cache_active;
+}
+
+static int alloc_swap_slot_cache(unsigned int cpu)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots, *slots_ret;
+
+ /*
+ * Do allocation outside swap_slots_cache_mutex
+ * as vzalloc could trigger reclaim and get_swap_page,
+ * which can lock swap_slots_cache_mutex.
+ */
+ slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ if (!slots)
+ return -ENOMEM;
+
+ slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ if (!slots_ret) {
+ vfree(slots);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&swap_slots_cache_mutex);
+ cache = &per_cpu(swp_slots, cpu);
+ if (cache->slots || cache->slots_ret)
+ /* cache already allocated */
+ goto out;
+ if (!cache->lock_initialized) {
+ mutex_init(&cache->alloc_lock);
+ spin_lock_init(&cache->free_lock);
+ cache->lock_initialized = true;
+ }
+ cache->nr = 0;
+ cache->cur = 0;
+ cache->n_ret = 0;
+ cache->slots = slots;
+ slots = NULL;
+ cache->slots_ret = slots_ret;
+ slots_ret = NULL;
+out:
+ mutex_unlock(&swap_slots_cache_mutex);
+ if (slots)
+ vfree(slots);
+ if (slots_ret)
+ vfree(slots_ret);
+ return 0;
+}
+
+static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
+ bool free_slots)
+{
+ struct swap_slots_cache *cache;
+ swp_entry_t *slots = NULL;
+
+ cache = &per_cpu(swp_slots, cpu);
+ if ((type & SLOTS_CACHE) && cache->slots) {
+ mutex_lock(&cache->alloc_lock);
+ swapcache_free_entries(cache->slots + cache->cur, cache->nr);
+ cache->cur = 0;
+ cache->nr = 0;
+ if (free_slots && cache->slots) {
+ vfree(cache->slots);
+ cache->slots = NULL;
+ }
+ mutex_unlock(&cache->alloc_lock);
+ }
+ if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
+ spin_lock_irq(&cache->free_lock);
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ if (free_slots && cache->slots_ret) {
+ slots = cache->slots_ret;
+ cache->slots_ret = NULL;
+ }
+ spin_unlock_irq(&cache->free_lock);
+ if (slots)
+ vfree(slots);
+ }
+}
+
+static void __drain_swap_slots_cache(unsigned int type)
+{
+ unsigned int cpu;
+
+ /*
+ * This function is called during
+ * 1) swapoff, when we have to make sure no
+ * left over slots are in cache when we remove
+ * a swap device;
+ * 2) disabling of swap slot cache, when we run low
+ * on swap slots when allocating memory and need
+ * to return swap slots to global pool.
+ *
+ * We cannot acquire cpu hot plug lock here as
+ * this function can be invoked in the cpu
+ * hot plug path:
+ * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
+ * -> memory allocation -> direct reclaim -> get_swap_page
+ * -> drain_swap_slots_cache
+ *
+ * Hence the loop over current online cpu below could miss cpu that
+ * is being brought online but not yet marked as online.
+ * That is okay as we do not schedule and run anything on a
+ * cpu before it has been marked online. Hence, we will not
+ * fill any swap slots in slots cache of such cpu.
+ * There are no slots on such cpu that need to be drained.
+ */
+ for_each_online_cpu(cpu)
+ drain_slots_cache_cpu(cpu, type, false);
+}
+
+static int free_slot_cache(unsigned int cpu)
+{
+ mutex_lock(&swap_slots_cache_mutex);
+ drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
+ mutex_unlock(&swap_slots_cache_mutex);
+ return 0;
+}
+
+int enable_swap_slots_cache(void)
+{
+ int ret = 0;
+
+ mutex_lock(&swap_slots_cache_enable_mutex);
+ if (swap_slot_cache_initialized) {
+ __reenable_swap_slots_cache();
+ goto out_unlock;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (ret < 0)
+ goto out_unlock;
+ swap_slot_cache_initialized = true;
+ __reenable_swap_slots_cache();
+out_unlock:
+ mutex_unlock(&swap_slots_cache_enable_mutex);
+ return 0;
+}
+
+/* called with swap slot cache's alloc lock held */
+static int refill_swap_slots_cache(struct swap_slots_cache *cache)
+{
+ if (!use_swap_slot_cache || cache->nr)
+ return 0;
+
+ cache->cur = 0;
+ if (swap_slot_cache_active)
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
+
+ return cache->nr;
+}
+
+int free_swap_slot(swp_entry_t entry)
+{
+ struct swap_slots_cache *cache;
+
+ cache = &get_cpu_var(swp_slots);
+ if (use_swap_slot_cache && cache->slots_ret) {
+ spin_lock_irq(&cache->free_lock);
+ /* Swap slots cache may be deactivated before acquiring lock */
+ if (!use_swap_slot_cache) {
+ spin_unlock_irq(&cache->free_lock);
+ goto direct_free;
+ }
+ if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
+ /*
+ * Return slots to global pool.
+ * The current swap_map value is SWAP_HAS_CACHE.
+ * Set it to 0 to indicate it is available for
+ * allocation in global pool
+ */
+ swapcache_free_entries(cache->slots_ret, cache->n_ret);
+ cache->n_ret = 0;
+ }
+ cache->slots_ret[cache->n_ret++] = entry;
+ spin_unlock_irq(&cache->free_lock);
+ } else {
+direct_free:
+ swapcache_free_entries(&entry, 1);
+ }
+ put_cpu_var(swp_slots);
+
+ return 0;
+}
+
+swp_entry_t get_swap_page(void)
+{
+ swp_entry_t entry, *pentry;
+ struct swap_slots_cache *cache;
+
+ /*
+ * Preemption is allowed here, because we may sleep
+ * in refill_swap_slots_cache(). But it is safe, because
+ * accesses to the per-CPU data structure are protected by the
+ * mutex cache->alloc_lock.
+ *
+ * The alloc path here does not touch cache->slots_ret
+ * so cache->free_lock is not taken.
+ */
+ cache = raw_cpu_ptr(&swp_slots);
+
+ entry.val = 0;
+ if (check_cache_active()) {
+ mutex_lock(&cache->alloc_lock);
+ if (cache->slots) {
+repeat:
+ if (cache->nr) {
+ pentry = &cache->slots[cache->cur++];
+ entry = *pentry;
+ pentry->val = 0;
+ cache->nr--;
+ } else {
+ if (refill_swap_slots_cache(cache))
+ goto repeat;
+ }
+ }
+ mutex_unlock(&cache->alloc_lock);
+ if (entry.val)
+ return entry;
+ }
+
+ get_swap_pages(1, &entry);
+
+ return entry;
+}
+
+#endif /* CONFIG_SWAP */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 35d7e0ee1c77..473b71e052a8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,8 @@
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
+#include <linux/vmalloc.h>
+#include <linux/swap_slots.h>
#include <asm/pgtable.h>
@@ -32,15 +34,8 @@ static const struct address_space_operations swap_aops = {
#endif
};
-struct address_space swapper_spaces[MAX_SWAPFILES] = {
- [0 ... MAX_SWAPFILES - 1] = {
- .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
- .i_mmap_writable = ATOMIC_INIT(0),
- .a_ops = &swap_aops,
- /* swap cache doesn't use writeback related tags */
- .flags = 1 << AS_NO_WRITEBACK_TAGS,
- }
-};
+struct address_space *swapper_spaces[MAX_SWAPFILES];
+static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,11 +48,26 @@ static struct {
unsigned long total_swapcache_pages(void)
{
- int i;
+ unsigned int i, j, nr;
unsigned long ret = 0;
+ struct address_space *spaces;
- for (i = 0; i < MAX_SWAPFILES; i++)
- ret += swapper_spaces[i].nrpages;
+ rcu_read_lock();
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ /*
+ * The corresponding entries in nr_swapper_spaces and
+ * swapper_spaces will be reused only after at least
+ * one grace period. So it is impossible for them
+ * belongs to different usage.
+ */
+ nr = nr_swapper_spaces[i];
+ spaces = rcu_dereference(swapper_spaces[i]);
+ if (!nr || !spaces)
+ continue;
+ for (j = 0; j < nr; j++)
+ ret += spaces[j].nrpages;
+ }
+ rcu_read_unlock();
return ret;
}
@@ -315,6 +325,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
break;
/*
+ * Just skip read ahead for unused swap slot.
+ * During swap_off when swap_slot_cache is disabled,
+ * we have to handle the race between putting
+ * swap entry in swap cache and marking swap slot
+ * as SWAP_HAS_CACHE. That's done in later part of code or
+ * else swap_off will be aborted if we return NULL.
+ */
+ if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
+ break;
+
+ /*
* Get a new page to read into from swap.
*/
if (!new_page) {
@@ -505,3 +526,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
+
+int init_swap_address_space(unsigned int type, unsigned long nr_pages)
+{
+ struct address_space *spaces, *space;
+ unsigned int i, nr;
+
+ nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
+ spaces = vzalloc(sizeof(struct address_space) * nr);
+ if (!spaces)
+ return -ENOMEM;
+ for (i = 0; i < nr; i++) {
+ space = spaces + i;
+ INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
+ atomic_set(&space->i_mmap_writable, 0);
+ space->a_ops = &swap_aops;
+ /* swap cache doesn't use writeback related tags */
+ mapping_set_no_writeback_tags(space);
+ spin_lock_init(&space->tree_lock);
+ }
+ nr_swapper_spaces[type] = nr;
+ rcu_assign_pointer(swapper_spaces[type], spaces);
+
+ return 0;
+}
+
+void exit_swap_address_space(unsigned int type)
+{
+ struct address_space *spaces;
+
+ spaces = swapper_spaces[type];
+ nr_swapper_spaces[type] = 0;
+ rcu_assign_pointer(swapper_spaces[type], NULL);
+ synchronize_rcu();
+ kvfree(spaces);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f30438970cd1..178130880b90 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,8 @@
*/
#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
@@ -34,6 +36,7 @@
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
+#include <linux/swap_slots.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -257,6 +260,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
+static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ ci = si->cluster_info;
+ if (ci) {
+ ci += offset / SWAPFILE_CLUSTER;
+ spin_lock(&ci->lock);
+ }
+ return ci;
+}
+
+static inline void unlock_cluster(struct swap_cluster_info *ci)
+{
+ if (ci)
+ spin_unlock(&ci->lock);
+}
+
+static inline struct swap_cluster_info *lock_cluster_or_swap_info(
+ struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ if (!ci)
+ spin_lock(&si->lock);
+
+ return ci;
+}
+
+static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ if (ci)
+ unlock_cluster(ci);
+ else
+ spin_unlock(&si->lock);
+}
+
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
return cluster_is_null(&list->head);
@@ -281,9 +325,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
cluster_set_next_flag(&list->head, idx, 0);
cluster_set_next_flag(&list->tail, idx, 0);
} else {
+ struct swap_cluster_info *ci_tail;
unsigned int tail = cluster_next(&list->tail);
- cluster_set_next(&ci[tail], idx);
+ /*
+ * Nested cluster lock, but both cluster locks are
+ * only acquired when we held swap_info_struct->lock
+ */
+ ci_tail = ci + tail;
+ spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
+ cluster_set_next(ci_tail, idx);
+ unlock_cluster(ci_tail);
cluster_set_next_flag(&list->tail, idx, 0);
}
}
@@ -328,7 +380,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
*/
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
- struct swap_cluster_info *info;
+ struct swap_cluster_info *info, *ci;
unsigned int idx;
info = si->cluster_info;
@@ -341,10 +393,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
SWAPFILE_CLUSTER);
spin_lock(&si->lock);
- cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+ ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+ cluster_set_flag(ci, CLUSTER_FLAG_FREE);
+ unlock_cluster(ci);
cluster_list_add_tail(&si->free_clusters, info, idx);
+ ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
}
}
@@ -443,12 +499,13 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
* Try to get a swap entry from current cpu's swap entry pool (a cluster). This
* might involve allocating a new cluster for current CPU too.
*/
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
unsigned long *offset, unsigned long *scan_base)
{
struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
bool found_free;
- unsigned long tmp;
+ unsigned long tmp, max;
new_cluster:
cluster = this_cpu_ptr(si->percpu_cluster);
@@ -466,7 +523,7 @@ new_cluster:
*scan_base = *offset = si->cluster_next;
goto new_cluster;
} else
- return;
+ return false;
}
found_free = false;
@@ -476,14 +533,21 @@ new_cluster:
* check if there is still free entry in the cluster
*/
tmp = cluster->next;
- while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
- SWAPFILE_CLUSTER) {
+ max = min_t(unsigned long, si->max,
+ (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+ if (tmp >= max) {
+ cluster_set_null(&cluster->index);
+ goto new_cluster;
+ }
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
if (!si->swap_map[tmp]) {
found_free = true;
break;
}
tmp++;
}
+ unlock_cluster(ci);
if (!found_free) {
cluster_set_null(&cluster->index);
goto new_cluster;
@@ -491,15 +555,22 @@ new_cluster:
cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
+ return found_free;
}
-static unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
+static int scan_swap_map_slots(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+ swp_entry_t slots[])
{
+ struct swap_cluster_info *ci;
unsigned long offset;
unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ int n_ret = 0;
+
+ if (nr > SWAP_BATCH)
+ nr = SWAP_BATCH;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -517,8 +588,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
/* SSD algorithm */
if (si->cluster_info) {
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
- goto checks;
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto checks;
+ else
+ goto scan;
}
if (unlikely(!si->cluster_nr--)) {
@@ -562,8 +635,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
checks:
if (si->cluster_info) {
- while (scan_swap_map_ssd_cluster_conflict(si, offset))
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+ while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+ /* take a break if we already got some slots */
+ if (n_ret)
+ goto done;
+ if (!scan_swap_map_try_ssd_cluster(si, &offset,
+ &scan_base))
+ goto scan;
+ }
}
if (!(si->flags & SWP_WRITEOK))
goto no_page;
@@ -572,9 +651,11 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
+ ci = lock_cluster(si, offset);
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
+ unlock_cluster(ci);
spin_unlock(&si->lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
spin_lock(&si->lock);
@@ -584,8 +665,13 @@ checks:
goto scan; /* check next one */
}
- if (si->swap_map[offset])
- goto scan;
+ if (si->swap_map[offset]) {
+ unlock_cluster(ci);
+ if (!n_ret)
+ goto scan;
+ else
+ goto done;
+ }
if (offset == si->lowest_bit)
si->lowest_bit++;
@@ -601,10 +687,45 @@ checks:
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
+ unlock_cluster(ci);
si->cluster_next = offset + 1;
- si->flags -= SWP_SCANNING;
+ slots[n_ret++] = swp_entry(si->type, offset);
+
+ /* got enough slots or reach max slots? */
+ if ((n_ret == nr) || (offset >= si->highest_bit))
+ goto done;
- return offset;
+ /* search for next available slot */
+
+ /* time to take a break? */
+ if (unlikely(--latency_ration < 0)) {
+ if (n_ret)
+ goto done;
+ spin_unlock(&si->lock);
+ cond_resched();
+ spin_lock(&si->lock);
+ latency_ration = LATENCY_LIMIT;
+ }
+
+ /* try to get more slots in cluster */
+ if (si->cluster_info) {
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ goto checks;
+ else
+ goto done;
+ }
+ /* non-ssd case */
+ ++offset;
+
+ /* non-ssd case, still more slots in cluster? */
+ if (si->cluster_nr && !si->swap_map[offset]) {
+ --si->cluster_nr;
+ goto checks;
+ }
+
+done:
+ si->flags -= SWP_SCANNING;
+ return n_ret;
scan:
spin_unlock(&si->lock);
@@ -642,17 +763,41 @@ scan:
no_page:
si->flags -= SWP_SCANNING;
- return 0;
+ return n_ret;
+}
+
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
+{
+ swp_entry_t entry;
+ int n_ret;
+
+ n_ret = scan_swap_map_slots(si, usage, 1, &entry);
+
+ if (n_ret)
+ return swp_offset(entry);
+ else
+ return 0;
+
}
-swp_entry_t get_swap_page(void)
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[])
{
struct swap_info_struct *si, *next;
- pgoff_t offset;
+ long avail_pgs;
+ int n_ret = 0;
- if (atomic_long_read(&nr_swap_pages) <= 0)
+ avail_pgs = atomic_long_read(&nr_swap_pages);
+ if (avail_pgs <= 0)
goto noswap;
- atomic_long_dec(&nr_swap_pages);
+
+ if (n_goal > SWAP_BATCH)
+ n_goal = SWAP_BATCH;
+
+ if (n_goal > avail_pgs)
+ n_goal = avail_pgs;
+
+ atomic_long_sub(n_goal, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -678,14 +823,14 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
-
- /* This is called for allocating swap entry for cache */
- offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries);
spin_unlock(&si->lock);
- if (offset)
- return swp_entry(si->type, offset);
+ if (n_ret)
+ goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
- si->type);
+ si->type);
+
spin_lock(&swap_avail_lock);
nextsi:
/*
@@ -696,7 +841,8 @@ nextsi:
* up between us dropping swap_avail_lock and taking si->lock.
* Since we dropped the swap_avail_lock, the swap_avail_head
* list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over.
+ * swap_avail_head list then try it, otherwise start over
+ * if we have not gotten any slots.
*/
if (plist_node_empty(&next->avail_list))
goto start_over;
@@ -704,9 +850,11 @@ nextsi:
spin_unlock(&swap_avail_lock);
- atomic_long_inc(&nr_swap_pages);
+check_out:
+ if (n_ret < n_goal)
+ atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages);
noswap:
- return (swp_entry_t) {0};
+ return n_ret;
}
/* The only caller of this function is now suspend routine */
@@ -731,7 +879,7 @@ swp_entry_t get_swap_page_of_type(int type)
return (swp_entry_t) {0};
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset, type;
@@ -747,34 +895,76 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
- if (!p->swap_map[offset])
- goto bad_free;
- spin_lock(&p->lock);
return p;
-bad_free:
- pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
- goto out;
bad_offset:
- pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
+ pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
return NULL;
}
-static unsigned char swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
+ struct swap_info_struct *p;
+
+ p = __swap_info_get(entry);
+ if (!p)
+ goto out;
+ if (!p->swap_map[swp_offset(entry)])
+ goto bad_free;
+ return p;
+
+bad_free:
+ pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+ goto out;
+out:
+ return NULL;
+}
+
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+ if (p)
+ spin_lock(&p->lock);
+ return p;
+}
+
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+ struct swap_info_struct *q)
+{
+ struct swap_info_struct *p;
+
+ p = _swap_info_get(entry);
+
+ if (p != q) {
+ if (q != NULL)
+ spin_unlock(&q->lock);
+ if (p != NULL)
+ spin_lock(&p->lock);
+ }
+ return p;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
+ ci = lock_cluster_or_swap_info(p, offset);
+
count = p->swap_map[offset];
+
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
@@ -798,38 +988,52 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
usage = count | has_cache;
- p->swap_map[offset] = usage;
-
- /* free if no reference */
- if (!usage) {
- mem_cgroup_uncharge_swap(entry);
- dec_cluster_info_page(p, p->cluster_info, offset);
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit) {
- bool was_full = !p->highest_bit;
- p->highest_bit = offset;
- if (was_full && (p->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&p->avail_list));
- if (plist_node_empty(&p->avail_list))
- plist_add(&p->avail_list,
- &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
- }
- atomic_long_inc(&nr_swap_pages);
- p->inuse_pages--;
- frontswap_invalidate_page(p->type, offset);
- if (p->flags & SWP_BLKDEV) {
- struct gendisk *disk = p->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev,
- offset);
+ p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+
+ unlock_cluster_or_swap_info(p, ci);
+
+ return usage;
+}
+
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char count;
+
+ ci = lock_cluster(p, offset);
+ count = p->swap_map[offset];
+ VM_BUG_ON(count != SWAP_HAS_CACHE);
+ p->swap_map[offset] = 0;
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ unlock_cluster(ci);
+
+ mem_cgroup_uncharge_swap(entry);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
+
+ p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
}
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+ if (p->flags & SWP_BLKDEV) {
+ struct gendisk *disk = p->bdev->bd_disk;
- return usage;
+ if (disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev,
+ offset);
+ }
}
/*
@@ -840,10 +1044,10 @@ void swap_free(swp_entry_t entry)
{
struct swap_info_struct *p;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, 1);
- spin_unlock(&p->lock);
+ if (!__swap_entry_free(p, entry, 1))
+ free_swap_slot(entry);
}
}
@@ -854,13 +1058,35 @@ void swapcache_free(swp_entry_t entry)
{
struct swap_info_struct *p;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- swap_entry_free(p, entry, SWAP_HAS_CACHE);
- spin_unlock(&p->lock);
+ if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
+ free_swap_slot(entry);
}
}
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+ struct swap_info_struct *p, *prev;
+ int i;
+
+ if (n <= 0)
+ return;
+
+ prev = NULL;
+ p = NULL;
+ for (i = 0; i < n; ++i) {
+ p = swap_info_get_cont(entries[i], prev);
+ if (p)
+ swap_entry_free(p, entries[i]);
+ else
+ break;
+ prev = p;
+ }
+ if (p)
+ spin_unlock(&p->lock);
+}
+
/*
* How many references to page are currently swapped out?
* This does not give an exact answer when swap count is continued,
@@ -870,13 +1096,39 @@ int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
swp_entry_t entry;
+ unsigned long offset;
entry.val = page_private(page);
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- count = swap_count(p->swap_map[swp_offset(entry)]);
- spin_unlock(&p->lock);
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(p, offset);
+ count = swap_count(p->swap_map[offset]);
+ unlock_cluster_or_swap_info(p, ci);
+ }
+ return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+
+ si = __swap_info_get(entry);
+ if (si) {
+ offset = swp_offset(entry);
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
}
return count;
}
@@ -889,22 +1141,26 @@ int swp_swapcount(swp_entry_t entry)
{
int count, tmp_count, n;
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
struct page *page;
pgoff_t offset;
unsigned char *map;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (!p)
return 0;
- count = swap_count(p->swap_map[swp_offset(entry)]);
+ offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+
+ count = swap_count(p->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
goto out;
count &= ~COUNT_CONTINUED;
n = SWAP_MAP_MAX + 1;
- offset = swp_offset(entry);
page = vmalloc_to_page(p->swap_map + offset);
offset &= ~PAGE_MASK;
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -919,7 +1175,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- spin_unlock(&p->lock);
+ unlock_cluster_or_swap_info(p, ci);
return count;
}
@@ -943,11 +1199,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
- if (count == 1 && !PageWriteback(page)) {
+ if (count != 1)
+ goto out;
+ if (!PageWriteback(page)) {
delete_from_swap_cache(page);
SetPageDirty(page);
+ } else {
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p->flags & SWP_STABLE_WRITES) {
+ spin_unlock(&p->lock);
+ return false;
+ }
+ spin_unlock(&p->lock);
}
}
+out:
return count <= 1;
}
@@ -997,21 +1267,23 @@ int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
struct page *page = NULL;
+ unsigned char count;
if (non_swap_entry(entry))
return 1;
- p = swap_info_get(entry);
+ p = _swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+ count = __swap_entry_free(p, entry, 1);
+ if (count == SWAP_HAS_CACHE) {
page = find_get_page(swap_address_space(entry),
swp_offset(entry));
if (page && !trylock_page(page)) {
put_page(page);
page = NULL;
}
- }
- spin_unlock(&p->lock);
+ } else if (!count)
+ free_swap_slot(entry);
}
if (page) {
/*
@@ -1234,6 +1506,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
+ cond_resched();
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
@@ -1244,7 +1517,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
return 0;
}
-static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
{
@@ -1252,7 +1525,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long next;
int ret;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
@@ -1264,6 +1537,26 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
return 0;
}
+static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ swp_entry_t entry, struct page *page)
+{
+ p4d_t *p4d;
+ unsigned long next;
+ int ret;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
static int unuse_vma(struct vm_area_struct *vma,
swp_entry_t entry, struct page *page)
{
@@ -1287,7 +1580,7 @@ static int unuse_vma(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+ ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
@@ -1313,6 +1606,7 @@ static int unuse_mm(struct mm_struct *mm,
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
break;
+ cond_resched();
}
up_read(&mm->mmap_sem);
return (ret < 0)? ret: 0;
@@ -1350,15 +1644,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
- if (frontswap) {
- if (frontswap_test(si, i))
- break;
- else
- continue;
- }
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
- break;
+ if (!frontswap || frontswap_test(si, i))
+ break;
+ if ((i % LATENCY_LIMIT) == 0)
+ cond_resched();
}
return i;
}
@@ -1402,7 +1693,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
* that.
*/
start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
+ mmget(&init_mm);
/*
* Keep on scanning until all entries have gone. Usually,
@@ -1451,7 +1742,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
if (atomic_read(&start_mm->mm_users) == 1) {
mmput(start_mm);
start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
+ mmget(&init_mm);
}
/*
@@ -1488,13 +1779,13 @@ int try_to_unuse(unsigned int type, bool frontswap,
struct mm_struct *prev_mm = start_mm;
struct mm_struct *mm;
- atomic_inc(&new_start_mm->mm_users);
- atomic_inc(&prev_mm->mm_users);
+ mmget(new_start_mm);
+ mmget(prev_mm);
spin_lock(&mmlist_lock);
while (swap_count(*swap_map) && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
- if (!atomic_inc_not_zero(&mm->mm_users))
+ if (!mmget_not_zero(mm))
continue;
spin_unlock(&mmlist_lock);
mmput(prev_mm);
@@ -1512,7 +1803,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
- atomic_inc(&mm->mm_users);
+ mmget(mm);
new_start_mm = mm;
set_start_mm = 0;
}
@@ -1840,6 +2131,17 @@ static void reinsert_swap_info(struct swap_info_struct *p)
spin_unlock(&swap_lock);
}
+bool has_usable_swap(void)
+{
+ bool ret = true;
+
+ spin_lock(&swap_lock);
+ if (plist_head_empty(&swap_active_head))
+ ret = false;
+ spin_unlock(&swap_lock);
+ return ret;
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1910,6 +2212,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ disable_swap_slots_cache_lock();
+
set_current_oom_origin();
err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
@@ -1917,9 +2221,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
+ reenable_swap_slots_cache_unlock();
goto out_dput;
}
+ reenable_swap_slots_cache_unlock();
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -1962,6 +2269,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
+ exit_swap_address_space(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -2285,6 +2593,13 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return maxpages;
}
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header,
unsigned char *swap_map,
@@ -2292,11 +2607,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
unsigned long maxpages,
sector_t *span)
{
- int i;
+ unsigned int j, k;
unsigned int nr_good_pages;
int nr_extents;
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+ unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+ unsigned long i, idx;
nr_good_pages = maxpages - 1; /* omit header page */
@@ -2344,15 +2660,23 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (!cluster_info)
return nr_extents;
- for (i = 0; i < nr_clusters; i++) {
- if (!cluster_count(&cluster_info[idx])) {
+
+ /*
+ * Reduce false cache line sharing between cluster_info and
+ * sharing same address space.
+ */
+ for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+ j = (k + col) % SWAP_CLUSTER_COLS;
+ for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+ idx = i * SWAP_CLUSTER_COLS + j;
+ if (idx >= nr_clusters)
+ continue;
+ if (cluster_count(&cluster_info[idx]))
+ continue;
cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
cluster_list_add_tail(&p->free_clusters, cluster_info,
idx);
}
- idx++;
- if (idx == nr_clusters)
- idx = 0;
}
return nr_extents;
}
@@ -2449,8 +2773,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -ENOMEM;
goto bad_swap;
}
+
+ if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ p->flags |= SWP_STABLE_WRITES;
+
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
+ unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
/*
@@ -2458,13 +2787,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
* SSD
*/
p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+ nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
- SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+ cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
if (!cluster_info) {
error = -ENOMEM;
goto bad_swap;
}
+
+ for (ci = 0; ci < nr_cluster; ci++)
+ spin_lock_init(&((cluster_info + ci)->lock));
+
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
error = -ENOMEM;
@@ -2521,6 +2854,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
+ error = init_swap_address_space(p->type, maxpages);
+ if (error)
+ goto bad_swap;
+
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
@@ -2576,6 +2913,8 @@ out:
putname(name);
if (inode && S_ISREG(inode->i_mode))
inode_unlock(inode);
+ if (!error)
+ enable_swap_slots_cache();
return error;
}
@@ -2610,6 +2949,7 @@ void si_swapinfo(struct sysinfo *val)
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
+ struct swap_cluster_info *ci;
unsigned long offset, type;
unsigned char count;
unsigned char has_cache;
@@ -2623,10 +2963,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
goto bad_file;
p = swap_info[type];
offset = swp_offset(entry);
-
- spin_lock(&p->lock);
if (unlikely(offset >= p->max))
- goto unlock_out;
+ goto out;
+
+ ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -2669,7 +3009,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
p->swap_map[offset] = count | has_cache;
unlock_out:
- spin_unlock(&p->lock);
+ unlock_cluster_or_swap_info(p, ci);
out:
return err;
@@ -2758,6 +3098,7 @@ EXPORT_SYMBOL_GPL(__page_file_index);
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
struct page *head;
struct page *page;
struct page *list_page;
@@ -2781,6 +3122,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
offset = swp_offset(entry);
+
+ ci = lock_cluster(si, offset);
+
count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
@@ -2793,6 +3137,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
+ unlock_cluster(ci);
spin_unlock(&si->lock);
return -ENOMEM;
}
@@ -2841,6 +3186,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
list_add_tail(&page->lru, &head->lru);
page = NULL; /* now it's attached, don't free it */
out:
+ unlock_cluster(ci);
spin_unlock(&si->lock);
outer:
if (page)
@@ -2854,7 +3200,8 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
pgoff_t offset, unsigned char count)
diff --git a/mm/truncate.c b/mm/truncate.c
index 8d8c62d89e6d..6263affdef88 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -20,56 +20,84 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"
-static void clear_exceptional_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+ void *entry)
{
struct radix_tree_node *node;
void **slot;
- /* Handled by shmem itself */
- if (shmem_mapping(mapping))
- return;
-
- if (dax_mapping(mapping)) {
- dax_delete_mapping_entry(mapping, index);
- return;
- }
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
- &slot))
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
goto unlock;
if (*slot != entry)
goto unlock;
- radix_tree_replace_slot(slot, NULL);
+ __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
+ workingset_update_node, mapping);
mapping->nrexceptional--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ if (dax_mapping(mapping)) {
+ dax_delete_mapping_entry(mapping, index);
+ return;
+ }
+ clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry_sync(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
@@ -277,7 +305,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -366,7 +395,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -485,7 +515,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ invalidate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -607,7 +638,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ if (!invalidate_exceptional_entry2(mapping,
+ index, page))
+ ret = -EBUSY;
continue;
}
@@ -753,7 +786,7 @@ EXPORT_SYMBOL(truncate_setsize);
*/
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
- int bsize = 1 << inode->i_blkbits;
+ int bsize = i_blocksize(inode);
loff_t rounded_from;
struct page *page;
pgoff_t index;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 3c8da0af9695..d155e12563b1 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -16,6 +16,9 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <asm/sections.h>
enum {
@@ -108,13 +111,13 @@ static inline const char *check_kernel_text_object(const void *ptr,
* __pa() is not just the reverse of __va(). This can be detected
* and checked:
*/
- textlow_linear = (unsigned long)__va(__pa(textlow));
+ textlow_linear = (unsigned long)lm_alias(textlow);
/* No different mapping: we're done. */
if (textlow_linear == textlow)
return NULL;
/* Check the secondary mapping... */
- texthigh_linear = (unsigned long)__va(__pa(texthigh));
+ texthigh_linear = (unsigned long)lm_alias(texthigh);
if (overlaps(ptr, n, textlow_linear, texthigh_linear))
return "<linear kernel text>";
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af817e5060fb..8bcb501bce60 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -8,12 +8,16 @@
*/
#include <linux/mm.h>
+#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -124,20 +128,248 @@ out_unlock:
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd = NULL;
pgd = pgd_offset(mm, address);
- pud = pud_alloc(mm, pgd, address);
- if (pud)
+ p4d = p4d_alloc(mm, pgd, address);
+ if (!p4d)
+ return NULL;
+ pud = pud_alloc(mm, p4d, address);
+ if (!pud)
+ return NULL;
+ /*
+ * Note that we didn't run this because the pmd was
+ * missing, the *pmd may be already established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ return pmd_alloc(mm, pud, address);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_sem held, it will release mmap_sem before returning.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+ int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ ssize_t err;
+ pte_t *dst_pte;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+ struct hstate *h;
+ unsigned long vma_hpagesize;
+ pgoff_t idx;
+ u32 hash;
+ struct address_space *mapping;
+
+ /*
+ * There is no default zero huge page for all huge page sizes as
+ * supported by hugetlb. A PMD_SIZE huge pages may exist as used
+ * by THP. Since we can not reliably insert a zero page, this
+ * feature is not supported.
+ */
+ if (zeropage) {
+ up_read(&dst_mm->mmap_sem);
+ return -EINVAL;
+ }
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+ vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+ /*
+ * Validate alignment based on huge page size
+ */
+ err = -EINVAL;
+ if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+ goto out_unlock;
+
+retry:
+ /*
+ * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * retry, dst_vma will be set to NULL and we must lookup again.
+ */
+ if (!dst_vma) {
+ err = -ENOENT;
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+ goto out_unlock;
+ /*
+ * Only allow __mcopy_atomic_hugetlb on userfaultfd
+ * registered ranges.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ err = -EINVAL;
+ if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+ goto out_unlock;
+
+ vm_shared = dst_vma->vm_flags & VM_SHARED;
+ }
+
+ if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
+ (len - copied) & (vma_hpagesize - 1)))
+ goto out_unlock;
+
+ /*
+ * If not shared, ensure the dst_vma has a anon_vma.
+ */
+ err = -ENOMEM;
+ if (!vm_shared) {
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+ }
+
+ h = hstate_vma(dst_vma);
+
+ while (src_addr < src_start + len) {
+ pte_t dst_pteval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+ VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+
+ /*
+ * Serialize via hugetlb_fault_mutex
+ */
+ idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
+ hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+ idx, dst_addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ err = -ENOMEM;
+ dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+ if (!dst_pte) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = -EEXIST;
+ dst_pteval = huge_ptep_get(dst_pte);
+ if (!huge_pte_none(dst_pteval)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
+
+ err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+ dst_addr, src_addr, &page);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ vm_alloc_shared = vm_shared;
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ err = copy_huge_page_from_user(page,
+ (const void __user *)src_addr,
+ pages_per_huge_page(h), true);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ down_read(&dst_mm->mmap_sem);
+
+ dst_vma = NULL;
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += vma_hpagesize;
+ src_addr += vma_hpagesize;
+ copied += vma_hpagesize;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page) {
/*
- * Note that we didn't run this because the pmd was
- * missing, the *pmd may be already established and in
- * turn it may also be a trans_huge_pmd.
+ * We encountered an error and are about to free a newly
+ * allocated huge page.
+ *
+ * Reservation handling is very subtle, and is different for
+ * private and shared mappings. See the routine
+ * restore_reserve_on_error for details. Unfortunately, we
+ * can not call restore_reserve_on_error now as it would
+ * require holding mmap_sem.
+ *
+ * If a reservation for the page existed in the reservation
+ * map of a private mapping, the map was modified to indicate
+ * the reservation was consumed when the page was allocated.
+ * We clear the PagePrivate flag now so that the global
+ * reserve count will not be incremented in free_huge_page.
+ * The reservation map will still indicate the reservation
+ * was consumed and possibly prevent later page allocation.
+ * This is better than leaking a global reservation. If no
+ * reservation existed, it is still safe to clear PagePrivate
+ * as no adjustments to reservation counts were made during
+ * allocation.
+ *
+ * The reservation map for shared mappings indicates which
+ * pages have reservations. When a huge page is allocated
+ * for an address with a reservation, no change is made to
+ * the reserve map. In this case PagePrivate will be set
+ * to indicate that the global reservation count should be
+ * incremented when the page is freed. This is the desired
+ * behavior. However, when a huge page is allocated for an
+ * address without a reservation a reservation entry is added
+ * to the reservation map, and PagePrivate will not be set.
+ * When the page is freed, the global reserve count will NOT
+ * be incremented and it will appear as though we have leaked
+ * reserved page. In this case, set PagePrivate so that the
+ * global reserve count will be incremented to match the
+ * reservation map entry which was created.
+ *
+ * Note that vm_alloc_shared is based on the flags of the vma
+ * for which the page was originally allocated. dst_vma could
+ * be different or NULL on error.
*/
- pmd = pmd_alloc(mm, pud, address);
- return pmd;
+ if (vm_alloc_shared)
+ SetPagePrivate(page);
+ else
+ ClearPagePrivate(page);
+ put_page(page);
+ }
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
@@ -173,14 +405,10 @@ retry:
* Make sure the vma is not shared, that the dst range is
* both valid and fully within a single existing vma.
*/
- err = -EINVAL;
+ err = -ENOENT;
dst_vma = find_vma(dst_mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out_unlock;
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
+ if (!dst_vma)
goto out_unlock;
-
/*
* Be strict and only allow __mcopy_atomic on userfaultfd
* registered ranges to prevent userland errors going
@@ -193,11 +421,27 @@ retry:
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ err = -EINVAL;
+ /*
+ * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+ * it will overwrite vm_ops, so vma_is_anonymous must return false.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+ dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+
/*
- * FIXME: only allow copying on anonymous vmas, tmpfs should
- * be added.
+ * If this is a HUGETLB vma, pass off to appropriate routine
*/
- if (dst_vma->vm_ops)
+ if (is_vm_hugetlb_page(dst_vma))
+ return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ src_start, len, zeropage);
+
+ if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
/*
@@ -206,7 +450,7 @@ retry:
* dst_vma.
*/
err = -ENOMEM;
- if (unlikely(anon_vma_prepare(dst_vma)))
+ if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;
while (src_addr < src_start + len) {
@@ -243,12 +487,21 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- if (!zeropage)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, &page);
- else
- err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr);
+ if (vma_is_anonymous(dst_vma)) {
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ &page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ } else {
+ err = -EINVAL; /* if zeropage is true return -EINVAL */
+ if (likely(!zeropage))
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr,
+ src_addr, &page);
+ }
cond_resched();
diff --git a/mm/util.c b/mm/util.c
index 1a41553db866..656dc5e37a87 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,15 +5,18 @@
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
+#include <linux/userfaultfd_k.h>
#include <asm/sections.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -297,14 +300,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
+ LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
- &populate);
+ &populate, &uf);
up_write(&mm->mmap_sem);
+ userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
diff --git a/mm/vmacache.c b/mm/vmacache.c
index 035fdeb35b43..7ffa0ee341b5 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -1,7 +1,8 @@
/*
* Copyright (C) 2014 Davidlohr Bueso.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -60,7 +61,7 @@ static inline bool vmacache_valid_mm(struct mm_struct *mm)
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache[VMACACHE_HASH(addr)] = newvma;
+ current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
}
static bool vmacache_valid(struct mm_struct *mm)
@@ -71,12 +72,12 @@ static bool vmacache_valid(struct mm_struct *mm)
return false;
curr = current;
- if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
/*
* First attempt will always be invalid, initialize
* the new cache for this task here.
*/
- curr->vmacache_seqnum = mm->vmacache_seqnum;
+ curr->vmacache.seqnum = mm->vmacache_seqnum;
vmacache_flush(curr);
return false;
}
@@ -93,7 +94,7 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[i];
if (!vma)
continue;
@@ -121,7 +122,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[i];
if (vma && vma->vm_start == start && vma->vm_end == end) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f2481cb4e6b2..0b057628a7ba 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,7 +12,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
@@ -32,7 +32,7 @@
#include <linux/llist.h>
#include <linux/bitops.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -86,12 +86,12 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_clear_huge(pud))
@@ -102,6 +102,22 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
} while (pud++, addr = next, addr != end);
}
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_clear_huge(p4d))
+ continue;
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ vunmap_pud_range(p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+}
+
static void vunmap_page_range(unsigned long addr, unsigned long end)
{
pgd_t *pgd;
@@ -113,7 +129,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_pud_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
@@ -160,13 +176,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return 0;
}
-static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, pgd, addr);
+ pud = pud_alloc(&init_mm, p4d, addr);
if (!pud)
return -ENOMEM;
do {
@@ -177,6 +193,23 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
return 0;
}
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc(&init_mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ return -ENOMEM;
+ } while (p4d++, addr = next, addr != end);
+ return 0;
+}
+
/*
* Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
* will have pfns corresponding to the "pages" array.
@@ -196,7 +229,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
@@ -237,6 +270,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
unsigned long addr = (unsigned long) vmalloc_addr;
struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
/*
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -244,21 +281,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
*/
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
- if (!pgd_none(*pgd)) {
- pud_t *pud = pud_offset(pgd, addr);
- if (!pud_none(*pud)) {
- pmd_t *pmd = pmd_offset(pud, addr);
- if (!pmd_none(*pmd)) {
- pte_t *ptep, pte;
-
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
- if (pte_present(pte))
- page = pte_page(pte);
- pte_unmap(ptep);
- }
- }
- }
+ if (pgd_none(*pgd))
+ return NULL;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return NULL;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return NULL;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ ptep = pte_offset_map(pmd, addr);
+ pte = *ptep;
+ if (pte_present(pte))
+ page = pte_page(pte);
+ pte_unmap(ptep);
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
@@ -365,7 +404,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
- might_sleep_if(gfpflags_allow_blocking(gfp_mask));
+ might_sleep();
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -601,6 +640,13 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual criticial section protected
+ * by this look, but we want to avoid concurrent calls for performance
+ * reasons and to make the pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
+
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -615,59 +661,40 @@ void set_iounmap_nonlazy(void)
/*
* Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
*/
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
- static DEFINE_SPINLOCK(purge_lock);
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- int nr = 0;
-
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
+ bool do_free = false;
- if (sync)
- purge_fragmented_blocks_allcpus();
+ lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
+ do_free = true;
}
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ if (!do_free)
+ return false;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ flush_tlb_kernel_range(start, end);
- if (nr) {
- spin_lock(&vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+ __free_vmap_area(va);
+ atomic_sub(nr, &vmap_lazy_nr);
+ cond_resched_lock(&vmap_area_lock);
}
- spin_unlock(&purge_lock);
+ spin_unlock(&vmap_area_lock);
+ return true;
}
/*
@@ -676,9 +703,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
*/
static void try_purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ if (mutex_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+ }
}
/*
@@ -686,9 +714,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -711,22 +740,13 @@ static void free_vmap_area_noflush(struct vmap_area *va)
}
/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
- */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
-{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
-}
-
-/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -740,16 +760,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
-{
- struct vmap_area *va;
-
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
-
-
/*** Per cpu kva allocator ***/
/*
@@ -1070,6 +1080,8 @@ void vm_unmap_aliases(void)
if (unlikely(!vmap_initialized))
return;
+ might_sleep();
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
@@ -1094,7 +1106,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -1107,7 +1123,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1116,10 +1134,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (likely(count <= VMAP_MAX_ALLOC)) {
vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1455,6 +1477,8 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ might_sleep();
+
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
@@ -1510,7 +1534,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
+static inline void __vfree_deferred(const void *addr)
+{
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * nother cpu's list. schedule_work() should be fine with this too.
+ */
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ __vfree_deferred(addr);
+}
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1533,11 +1589,9 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
@@ -1627,6 +1681,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
+ if (fatal_signal_pending(current)) {
+ area->nr_pages = i;
+ goto fail_no_warn;
+ }
+
if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask);
else
@@ -1647,9 +1706,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- warn_alloc(gfp_mask,
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
+fail_no_warn:
vfree(area->addr);
return NULL;
}
@@ -1709,7 +1769,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
- warn_alloc(gfp_mask,
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
return NULL;
}
@@ -2294,7 +2354,7 @@ EXPORT_SYMBOL_GPL(free_vm_area);
#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
- return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+ return rb_entry_safe(n, struct vmap_area, rb_node);
}
/**
@@ -2574,32 +2634,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_area_lock)
{
- loff_t n = *pos;
- struct vmap_area *va;
-
spin_lock(&vmap_area_lock);
- va = list_first_entry(&vmap_area_list, typeof(*va), list);
- while (n > 0 && &va->list != &vmap_area_list) {
- n--;
- va = list_next_entry(va, list);
- }
- if (!n && &va->list != &vmap_area_list)
- return va;
-
- return NULL;
-
+ return seq_list_start(&vmap_area_list, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct vmap_area *va = p, *next;
-
- ++*pos;
- next = list_next_entry(va, list);
- if (&next->list != &vmap_area_list)
- return next;
-
- return NULL;
+ return seq_list_next(p, &vmap_area_list, pos);
}
static void s_stop(struct seq_file *m, void *p)
@@ -2634,9 +2675,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static int s_show(struct seq_file *m, void *p)
{
- struct vmap_area *va = p;
+ struct vmap_area *va;
struct vm_struct *v;
+ va = list_entry(p, struct vmap_area, list);
+
/*
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
* behalf of vmap area is being tear down or vm_map_ram allocation.
@@ -2656,7 +2699,7 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr)
- seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
+ seq_printf(m, " phys=%pa", &v->phys_addr);
if (v->flags & VM_IOREMAP)
seq_puts(m, " ioremap");
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 149fdf6c5c56..6063581f705c 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -112,9 +112,16 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
- unsigned long pressure;
+ unsigned long pressure = 0;
/*
+ * reclaimed can be greater than scanned in cases
+ * like THP, where the scanned is 1 and reclaimed
+ * could be 512
+ */
+ if (reclaimed >= scanned)
+ goto out;
+ /*
* We calculate the ratio (in percents) of how many pages were
* scanned vs. reclaimed in a given time frame (window). Note that
* time is in VM reclaimer's "ticks", i.e. number of pages
@@ -124,6 +131,7 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
pressure = scale - (reclaimed * scale / scanned);
pressure = pressure * 100 / scale;
+out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d75cdf360730..bc8031ef994d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
@@ -87,6 +88,7 @@ struct scan_control {
/* The highest zone to isolate pages for reclaim from */
enum zone_type reclaim_idx;
+ /* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
/* Can mapped pages be reclaimed? */
@@ -234,12 +236,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
pgdat_reclaimable_pages(pgdat) * 6;
}
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size - Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
+ unsigned long lru_size;
+ int zid;
+
if (!mem_cgroup_disabled())
- return mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ else
+ lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+
+ for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+ unsigned long size;
+
+ if (!managed_zone(zone))
+ continue;
+
+ if (!mem_cgroup_disabled())
+ size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ else
+ size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+ NR_ZONE_LRU_BASE + lru);
+ lru_size -= min(size, lru_size);
+ }
+
+ return lru_size;
- return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}
/*
@@ -291,6 +320,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
+ long scanned = 0, next_deferred;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0)
@@ -312,7 +342,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
- }
+ next_deferred = nr;
+ } else
+ next_deferred = total_scan;
/*
* We need to avoid excessive windup on filesystem shrinkers
@@ -369,17 +401,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
count_vm_events(SLABS_SCANNED, nr_to_scan);
total_scan -= nr_to_scan;
+ scanned += nr_to_scan;
cond_resched();
}
+ if (next_deferred >= scanned)
+ next_deferred -= scanned;
+ else
+ next_deferred = 0;
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
+ if (next_deferred > 0)
+ new_nr = atomic_long_add_return(next_deferred,
&shrinker->nr_deferred[nid]);
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
@@ -894,6 +931,17 @@ static void page_check_dirty_writeback(struct page *page,
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+struct reclaim_stat {
+ unsigned nr_dirty;
+ unsigned nr_unqueued_dirty;
+ unsigned nr_congested;
+ unsigned nr_writeback;
+ unsigned nr_immediate;
+ unsigned nr_activate;
+ unsigned nr_ref_keep;
+ unsigned nr_unmap_fail;
+};
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -901,22 +949,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
- unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_unqueued_dirty,
- unsigned long *ret_nr_congested,
- unsigned long *ret_nr_writeback,
- unsigned long *ret_nr_immediate,
+ struct reclaim_stat *stat,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
- unsigned long nr_unqueued_dirty = 0;
- unsigned long nr_dirty = 0;
- unsigned long nr_congested = 0;
- unsigned long nr_reclaimed = 0;
- unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
+ unsigned nr_unqueued_dirty = 0;
+ unsigned nr_dirty = 0;
+ unsigned nr_congested = 0;
+ unsigned nr_reclaimed = 0;
+ unsigned nr_writeback = 0;
+ unsigned nr_immediate = 0;
+ unsigned nr_ref_keep = 0;
+ unsigned nr_unmap_fail = 0;
cond_resched();
@@ -1011,6 +1057,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
+ *
+ * In cases 1) and 2) we activate the pages to get them out of
+ * the way while we continue scanning for clean pages on the
+ * inactive list and refilling from the active list. The
+ * observation here is that waiting for disk writes is more
+ * expensive than potentially causing reloads down the line.
+ * Since they're marked for immediate reclaim, they won't put
+ * memory pressure on the cache working set any longer than it
+ * takes to write them to disk.
*/
if (PageWriteback(page)) {
/* Case 1 above */
@@ -1018,7 +1073,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
nr_immediate++;
- goto keep_locked;
+ goto activate_locked;
/* Case 2 above */
} else if (sane_reclaim(sc) ||
@@ -1036,7 +1091,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
SetPageReclaim(page);
nr_writeback++;
- goto keep_locked;
+ goto activate_locked;
/* Case 3 above */
} else {
@@ -1055,6 +1110,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
+ nr_ref_keep++;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1092,6 +1148,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
(ttu_flags | TTU_BATCH_FLUSH))) {
case SWAP_FAIL:
+ nr_unmap_fail++;
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
@@ -1106,13 +1163,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
/*
- * Only kswapd can writeback filesystem pages to
- * avoid risk of stack overflow but only writeback
- * if many dirty pages have been encountered.
+ * Only kswapd can writeback filesystem pages
+ * to avoid risk of stack overflow. But avoid
+ * injecting inefficient single-page IO into
+ * flusher writeback as much as possible: only
+ * write pages when we've encountered many
+ * dirty pages, and when we've already scanned
+ * the rest of the LRU for clean pages and see
+ * the same dirty pages again (PageReclaim).
*/
if (page_is_file_cache(page) &&
- (!current_is_kswapd() ||
- !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+ (!current_is_kswapd() || !PageReclaim(page) ||
+ !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -1122,7 +1184,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
- goto keep_locked;
+ goto activate_locked;
}
if (references == PAGEREF_RECLAIM_CLEAN)
@@ -1258,11 +1320,16 @@ keep:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- *ret_nr_dirty += nr_dirty;
- *ret_nr_congested += nr_congested;
- *ret_nr_unqueued_dirty += nr_unqueued_dirty;
- *ret_nr_writeback += nr_writeback;
- *ret_nr_immediate += nr_immediate;
+ if (stat) {
+ stat->nr_dirty = nr_dirty;
+ stat->nr_congested = nr_congested;
+ stat->nr_unqueued_dirty = nr_unqueued_dirty;
+ stat->nr_writeback = nr_writeback;
+ stat->nr_immediate = nr_immediate;
+ stat->nr_activate = pgactivate;
+ stat->nr_ref_keep = nr_ref_keep;
+ stat->nr_unmap_fail = nr_unmap_fail;
+ }
return nr_reclaimed;
}
@@ -1274,7 +1341,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
+ unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1287,8 +1354,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS,
- &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1323,13 +1389,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
* wants to isolate pages it will be able to operate on without
* blocking - clean pages for the most part.
*
- * ISOLATE_CLEAN means that only clean pages should be isolated. This
- * is used by reclaim when it is cannot write to backing storage
- *
* ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
* that it is possible to migrate without blocking
*/
- if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ if (mode & ISOLATE_ASYNC_MIGRATE) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
return ret;
@@ -1337,10 +1400,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (PageDirty(page)) {
struct address_space *mapping;
- /* ISOLATE_CLEAN means only clean pages */
- if (mode & ISOLATE_CLEAN)
- return ret;
-
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
@@ -1374,8 +1433,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
* be complete before mem_cgroup_update_lru_size due to a santity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
- enum lru_list lru, unsigned long *nr_zone_taken,
- unsigned long nr_taken)
+ enum lru_list lru, unsigned long *nr_zone_taken)
{
int zid;
@@ -1384,11 +1442,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
continue;
__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
- }
-
#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+ mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
#endif
+ }
+
}
/*
@@ -1420,6 +1478,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+ unsigned long skipped = 0, total_skipped = 0;
unsigned long scan, nr_pages;
LIST_HEAD(pages_skipped);
@@ -1471,14 +1530,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
if (!list_empty(&pages_skipped)) {
int zid;
- unsigned long total_skipped = 0;
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
- total_skipped += nr_skipped[zid];
+ skipped += nr_skipped[zid];
}
/*
@@ -1486,14 +1544,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* close to unreclaimable. If the LRU list is empty, account
* skipped pages as a full scan.
*/
- scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+ total_skipped = list_empty(src) ? skipped : skipped >> 2;
list_splice(&pages_skipped, src);
}
- *nr_scanned = scan;
- trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
- nr_taken, mode, is_file_lru(lru));
- update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
+ *nr_scanned = scan + total_skipped;
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
+ scan, skipped, nr_taken, mode, lru);
+ update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
@@ -1652,30 +1710,6 @@ static int current_may_throttle(void)
bdi_write_congested(current->backing_dev_info);
}
-static bool inactive_reclaimable_pages(struct lruvec *lruvec,
- struct scan_control *sc, enum lru_list lru)
-{
- int zid;
- struct zone *zone;
- int file = is_file_lru(lru);
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
- if (!global_reclaim(sc))
- return true;
-
- for (zid = sc->reclaim_idx; zid >= 0; zid--) {
- zone = &pgdat->node_zones[zid];
- if (!managed_zone(zone))
- continue;
-
- if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
- LRU_FILE * file) >= SWAP_CLUSTER_MAX)
- return true;
- }
-
- return false;
-}
-
/*
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
@@ -1688,19 +1722,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_dirty = 0;
- unsigned long nr_congested = 0;
- unsigned long nr_unqueued_dirty = 0;
- unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
+ struct reclaim_stat stat = {};
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- if (!inactive_reclaimable_pages(lruvec, sc, lru))
- return 0;
-
while (unlikely(too_many_isolated(pgdat, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1713,8 +1740,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
- if (!sc->may_writepage)
- isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&pgdat->lru_lock);
@@ -1737,9 +1762,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
- &nr_dirty, &nr_unqueued_dirty, &nr_congested,
- &nr_writeback, &nr_immediate,
- false);
+ &stat, false);
spin_lock_irq(&pgdat->lru_lock);
@@ -1773,7 +1796,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* of pages under pages flagged for immediate reclaim and stall if any
* are encountered in the nr_immediate check below.
*/
- if (nr_writeback && nr_writeback == nr_taken)
+ if (stat.nr_writeback && stat.nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
@@ -1785,17 +1808,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.
*/
- if (nr_dirty && nr_dirty == nr_congested)
+ if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
set_bit(PGDAT_CONGESTED, &pgdat->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
- * implies that flushers are not keeping up. In this case, flag
- * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
- * reclaim context.
+ * implies that flushers are not doing their job. This can
+ * happen when memory pressure pushes dirty pages to the end of
+ * the LRU before the dirty limits are breached and the dirty
+ * data has expired. It can also happen when the proportion of
+ * dirty pages grows not through writes but through memory
+ * pressure reclaiming all the clean cache. And in some cases,
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep, but
+ * also allow kswapd to start writing pages during reclaim.
*/
- if (nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken) {
+ wakeup_flusher_threads(0, WB_REASON_VMSCAN);
set_bit(PGDAT_DIRTY, &pgdat->flags);
+ }
/*
* If kswapd scans pages marked marked for immediate
@@ -1803,7 +1834,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_immediate && current_may_throttle())
+ if (stat.nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
@@ -1818,6 +1849,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
nr_scanned, nr_reclaimed,
+ stat.nr_dirty, stat.nr_writeback,
+ stat.nr_congested, stat.nr_immediate,
+ stat.nr_activate, stat.nr_ref_keep,
+ stat.nr_unmap_fail,
sc->priority, file);
return nr_reclaimed;
}
@@ -1838,17 +1873,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
*
* The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lru.
*/
-static void move_active_pages_to_lru(struct lruvec *lruvec,
+static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *list,
struct list_head *pages_to_free,
enum lru_list lru)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- unsigned long pgmoved = 0;
struct page *page;
int nr_pages;
+ int nr_moved = 0;
while (!list_empty(list)) {
page = lru_to_page(list);
@@ -1860,7 +1897,6 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
nr_pages = hpage_nr_pages(page);
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
- pgmoved += nr_pages;
if (put_page_testzero(page)) {
__ClearPageLRU(page);
@@ -1874,11 +1910,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
if (!is_active_lru(lru))
- __count_vm_events(PGDEACTIVATE, pgmoved);
+ __count_vm_events(PGDEACTIVATE, nr_moved);
+
+ return nr_moved;
}
static void shrink_active_list(unsigned long nr_to_scan,
@@ -1894,7 +1934,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- unsigned long nr_rotated = 0;
+ unsigned nr_deactivate, nr_activate;
+ unsigned nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -1903,8 +1944,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
- if (!sc->may_writepage)
- isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&pgdat->lru_lock);
@@ -1972,13 +2011,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+ nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
+ trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
+ nr_deactivate, nr_rotated, sc->priority, file);
}
/*
@@ -2008,14 +2049,13 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc)
+ struct scan_control *sc, bool trace)
{
unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
+ unsigned long inactive, active;
+ enum lru_list inactive_lru = file * LRU_FILE;
+ enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
unsigned long gb;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- int zid;
/*
* If we don't have swap space, anonymous page deactivation
@@ -2024,29 +2064,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
if (!file && !total_swap_pages)
return false;
- inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
- active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
- /*
- * For zone-constrained allocations, it is necessary to check if
- * deactivations are required for lowmem to be reclaimed. This
- * calculates the inactive/active pages available in eligible zones.
- */
- for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
- unsigned long inactive_zone, active_zone;
-
- if (!managed_zone(zone))
- continue;
-
- inactive_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE));
- active_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
- inactive -= min(inactive, inactive_zone);
- active -= min(active, active_zone);
- }
+ inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+ active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -2054,6 +2073,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
else
inactive_ratio = 1;
+ if (trace)
+ trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
+ sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+ inactive_ratio, file);
+
return inactive * inactive_ratio < active;
}
@@ -2061,7 +2087,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2192,8 +2218,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, sc) &&
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2219,10 +2245,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
spin_lock_irq(&pgdat->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2260,7 +2286,7 @@ out:
unsigned long size;
unsigned long scan;
- size = lruvec_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2417,7 +2443,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2746,8 +2772,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int initial_priority = sc->priority;
- unsigned long total_scanned = 0;
- unsigned long writeback_threshold;
retry:
delayacct_freepages_start();
@@ -2760,7 +2784,6 @@ retry:
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
- total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;
@@ -2773,20 +2796,6 @@ retry:
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
-
- /*
- * Try to write back as many pages as we just scanned. This
- * tends to cause slow streaming writers to write data to the
- * disk smoothly, at the dirtying rate, which is nice. But
- * that's undesirable in laptop mode, where we *want* lumpy
- * writeout. So in laptop mode, write out the whole world.
- */
- writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
- if (total_scanned > writeback_threshold) {
- wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
- WB_REASON_TRY_TO_FREE_PAGES);
- sc->may_writepage = 1;
- }
} while (--sc->priority >= 0);
delayacct_freepages_end();
@@ -3067,7 +3076,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, sc))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3088,6 +3097,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
*/
clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
return true;
}
@@ -3558,24 +3568,21 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int kswapd_cpu_online(unsigned int cpu)
{
int nid;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
- mask = cpumask_of_node(pgdat->node_id);
+ mask = cpumask_of_node(pgdat->node_id);
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kswapd, mask);
- }
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
- return NOTIFY_OK;
+ return 0;
}
/*
@@ -3617,12 +3624,15 @@ void kswapd_stop(int nid)
static int __init kswapd_init(void)
{
- int nid;
+ int nid, ret;
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
- hotcpu_notifier(cpu_callback, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/vmscan:online", kswapd_cpu_online,
+ NULL);
+ WARN_ON(ret < 0);
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 604f26a4f696..5a4f5c5a31e8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = {
"compact_fail",
"compact_success",
"compact_daemon_wake",
+ "compact_daemon_migrate_scanned",
+ "compact_daemon_free_scanned",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -1063,6 +1065,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ "thp_split_pud",
+#endif
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
@@ -1547,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
@@ -1618,7 +1622,7 @@ static void vmstat_update(struct work_struct *w)
* to occur in the future. Keep on running the
* update worker thread.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
}
@@ -1697,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w)
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
if (!delayed_work_pending(dw) && need_update(cpu))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
}
put_online_cpus();
@@ -1713,82 +1717,76 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
static void __init init_cpu_node_state(void)
{
- int cpu;
+ int node;
- get_online_cpus();
- for_each_online_cpu(cpu)
- node_set_state(cpu_to_node(cpu), N_CPU);
- put_online_cpus();
+ for_each_online_node(node) {
+ if (cpumask_weight(cpumask_of_node(node)) > 0)
+ node_set_state(node, N_CPU);
+ }
}
-static void vmstat_cpu_dead(int node)
+static int vmstat_cpu_online(unsigned int cpu)
{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (cpu_to_node(cpu) == node)
- goto end;
+ refresh_zone_stat_thresholds();
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ return 0;
+}
- node_clear_state(node, N_CPU);
-end:
- put_online_cpus();
+static int vmstat_cpu_down_prep(unsigned int cpu)
+{
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ return 0;
}
-/*
- * Use the cpu notifier to insure that the thresholds are recalculated
- * when necessary.
- */
-static int vmstat_cpuup_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- refresh_zone_stat_thresholds();
- vmstat_cpu_dead(cpu_to_node(cpu));
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+static int vmstat_cpu_dead(unsigned int cpu)
+{
+ const struct cpumask *node_cpus;
+ int node;
+
+ node = cpu_to_node(cpu);
+
+ refresh_zone_stat_thresholds();
+ node_cpus = cpumask_of_node(node);
+ if (cpumask_weight(node_cpus) > 0)
+ return 0;
+
+ node_clear_state(node, N_CPU);
+ return 0;
}
-static struct notifier_block vmstat_notifier =
- { &vmstat_cpuup_callback, NULL, 0 };
#endif
-static int __init setup_vmstat(void)
+struct workqueue_struct *mm_percpu_wq;
+
+void __init init_mm_internals(void)
{
+ int ret __maybe_unused;
+
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
+
#ifdef CONFIG_SMP
- cpu_notifier_register_begin();
- __register_cpu_notifier(&vmstat_notifier);
+ ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
+ NULL, vmstat_cpu_dead);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'dead' hotplug state\n");
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
+ vmstat_cpu_online,
+ vmstat_cpu_down_prep);
+ if (ret < 0)
+ pr_err("vmstat: failed to register 'online' hotplug state\n");
+
+ get_online_cpus();
init_cpu_node_state();
+ put_online_cpus();
start_shepherd_timer();
- cpu_notifier_register_done();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
@@ -1796,9 +1794,7 @@ static int __init setup_vmstat(void)
proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
- return 0;
}
-module_init(setup_vmstat)
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
diff --git a/mm/workingset.c b/mm/workingset.c
index fb1f9183d89a..eda05c71fa49 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,10 +6,12 @@
#include <linux/memcontrol.h>
#include <linux/writeback.h>
+#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -266,7 +268,7 @@ bool workingset_refault(void *shadow)
}
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
rcu_read_unlock();
/*
@@ -334,48 +336,79 @@ out:
* point where they would still be useful.
*/
-struct list_lru workingset_shadow_nodes;
+static struct list_lru shadow_nodes;
+
+void workingset_update_node(struct radix_tree_node *node, void *private)
+{
+ struct address_space *mapping = private;
+
+ /* Only regular page cache has shadow entries */
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+
+ /*
+ * Track non-empty nodes that contain only shadow entries;
+ * unlink those that contain pages or are being freed.
+ *
+ * Avoid acquiring the list_lru lock when the nodes are
+ * already where they should be. The list_empty() test is safe
+ * as node->private_list is protected by &mapping->tree_lock.
+ */
+ if (node->count && node->count == node->exceptional) {
+ if (list_empty(&node->private_list))
+ list_lru_add(&shadow_nodes, &node->private_list);
+ } else {
+ if (!list_empty(&node->private_list))
+ list_lru_del(&shadow_nodes, &node->private_list);
+ }
+}
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long shadow_nodes;
unsigned long max_nodes;
- unsigned long pages;
+ unsigned long nodes;
+ unsigned long cache;
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+ nodes = list_lru_shrink_count(&shadow_nodes, sc);
local_irq_enable();
- if (sc->memcg) {
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
-
/*
- * Active cache pages are limited to 50% of memory, and shadow
- * entries that represent a refault distance bigger than that
- * do not have any effect. Limit the number of shadow nodes
- * such that shadow entries do not exceed the number of active
- * cache pages, assuming a worst-case node population density
- * of 1/8th on average.
+ * Approximate a reasonable limit for the radix tree nodes
+ * containing shadow entries. We don't need to keep more
+ * shadow entries than possible pages on the active list,
+ * since refault distances bigger than that are dismissed.
+ *
+ * The size of the active list converges toward 100% of
+ * overall page cache as memory grows, with only a tiny
+ * inactive list. Assume the total cache size for that.
+ *
+ * Nodes might be sparsely populated, with only one shadow
+ * entry in the extreme case. Obviously, we cannot keep one
+ * node for every eligible shadow entry, so compromise on a
+ * worst-case density of 1/8th. Below that, not all eligible
+ * refaults can be detected anymore.
*
* On 64-bit with 7 radix_tree_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
- * ~2% of available memory:
+ * ~1.8% of available memory:
*
- * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+ * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
*/
- max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+ if (sc->memcg) {
+ cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ } else {
+ cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+ node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
+ }
+ max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
- if (shadow_nodes <= max_nodes)
+ if (nodes <= max_nodes)
return 0;
-
- return shadow_nodes - max_nodes;
+ return nodes - max_nodes;
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
@@ -401,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
*/
node = container_of(item, struct radix_tree_node, private_list);
- mapping = node->private_data;
+ mapping = container_of(node->root, struct address_space, page_tree);
/* Coming from the list, invert the lock order */
if (!spin_trylock(&mapping->tree_lock)) {
@@ -418,23 +451,31 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- BUG_ON(!workingset_node_shadows(node));
- BUG_ON(workingset_node_pages(node));
-
+ if (WARN_ON_ONCE(!node->exceptional))
+ goto out_invalid;
+ if (WARN_ON_ONCE(node->count != node->exceptional))
+ goto out_invalid;
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
if (node->slots[i]) {
- BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
+ goto out_invalid;
+ if (WARN_ON_ONCE(!node->exceptional))
+ goto out_invalid;
+ if (WARN_ON_ONCE(!mapping->nrexceptional))
+ goto out_invalid;
node->slots[i] = NULL;
- workingset_node_shadows_dec(node);
- BUG_ON(!mapping->nrexceptional);
+ node->exceptional--;
+ node->count--;
mapping->nrexceptional--;
}
}
- BUG_ON(workingset_node_shadows(node));
+ if (WARN_ON_ONCE(node->exceptional))
+ goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- if (!__radix_tree_delete_node(&mapping->page_tree, node))
- BUG();
+ __radix_tree_delete_node(&mapping->page_tree, node,
+ workingset_update_node, mapping);
+out_invalid:
spin_unlock(&mapping->tree_lock);
ret = LRU_REMOVED_RETRY;
out:
@@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
- shadow_lru_isolate, NULL);
+ ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
@@ -492,7 +532,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
if (ret)
goto err;
ret = register_shrinker(&workingset_shadow_shrinker);
@@ -500,7 +540,7 @@ static int __init workingset_init(void)
goto err_list_lru;
return 0;
err_list_lru:
- list_lru_destroy(&workingset_shadow_nodes);
+ list_lru_destroy(&shadow_nodes);
err:
return ret;
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 8f9e89ca1d31..54f63c4a809a 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,28 +34,61 @@
/*****************
* Structures
*****************/
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX
+};
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the pool
+ * @page_lock: per-page lock
+ * @refcount: reference cound for the z3fold page
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ spinlock_t page_lock;
+ struct kref refcount;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:2;
+};
+
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
* adjusting internal fragmentation. It also determines the number of
* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * in allocated page is occupied by z3fold header, NCHUNKS will be calculated
- * to 63 which shows the max number of free chunks in z3fold page, also there
- * will be 63 freelists per pool.
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
*/
#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1)
-
-struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
+#define BUDDY_MASK (0x3)
/**
* struct z3fold_pool - stores metadata for each z3fold pool
@@ -64,8 +97,6 @@ struct z3fold_ops {
* @unbuddied: array of lists tracking z3fold pages that contain 2- buddies;
* the lists each z3fold page is added to depends on the size of
* its free region.
- * @buddied: list tracking the z3fold pages that contain 3 buddies;
- * these z3fold pages are full
* @lru: list tracking the z3fold pages in LRU order by most recently
* added buddy.
* @pages_nr: number of z3fold pages in the pool.
@@ -78,49 +109,22 @@ struct z3fold_ops {
struct z3fold_pool {
spinlock_t lock;
struct list_head unbuddied[NCHUNKS];
- struct list_head buddied;
struct list_head lru;
- u64 pages_nr;
+ atomic64_t pages_nr;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
};
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX
-};
-
-/*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the pool
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- */
-struct z3fold_header {
- struct list_head buddy;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:NCHUNKS_ORDER;
-};
-
/*
* Internal z3fold page flags
*/
enum z3fold_page_flags {
- UNDER_RECLAIM = 0,
- PAGE_HEADLESS,
+ PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
};
+
/*****************
* Helpers
*****************/
@@ -140,10 +144,11 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
struct z3fold_header *zhdr = page_address(page);
INIT_LIST_HEAD(&page->lru);
- clear_bit(UNDER_RECLAIM, &page->private);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ spin_lock_init(&zhdr->page_lock);
+ kref_init(&zhdr->refcount);
zhdr->first_chunks = 0;
zhdr->middle_chunks = 0;
zhdr->last_chunks = 0;
@@ -154,9 +159,42 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
}
/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct z3fold_header *zhdr)
+static void free_z3fold_page(struct page *page)
{
- __free_page(virt_to_page(zhdr));
+ __free_page(page);
+}
+
+static void release_z3fold_page(struct kref *ref)
+{
+ struct z3fold_header *zhdr;
+ struct page *page;
+
+ zhdr = container_of(ref, struct z3fold_header, refcount);
+ page = virt_to_page(zhdr);
+
+ if (!list_empty(&zhdr->buddy))
+ list_del(&zhdr->buddy);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ free_z3fold_page(page);
+}
+
+/* Lock a z3fold page */
+static inline void z3fold_page_lock(struct z3fold_header *zhdr)
+{
+ spin_lock(&zhdr->page_lock);
+}
+
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
+/* Unlock a z3fold page */
+static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
+{
+ spin_unlock(&zhdr->page_lock);
}
/*
@@ -179,7 +217,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
return (struct z3fold_header *)(handle & PAGE_MASK);
}
-/* Returns buddy number */
+/*
+ * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
+ * but that doesn't matter. because the masking will result in the
+ * correct buddy number.
+ */
static enum buddy handle_to_buddy(unsigned long handle)
{
struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
@@ -200,9 +242,10 @@ static int num_free_chunks(struct z3fold_header *zhdr)
*/
if (zhdr->middle_chunks != 0) {
int nfree_before = zhdr->first_chunks ?
- 0 : zhdr->start_middle - 1;
+ 0 : zhdr->start_middle - ZHDR_CHUNKS;
int nfree_after = zhdr->last_chunks ?
- 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+ 0 : TOTAL_CHUNKS -
+ (zhdr->start_middle + zhdr->middle_chunks);
nfree = max(nfree_before, nfree_after);
} else
nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
@@ -232,9 +275,8 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
spin_lock_init(&pool->lock);
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->buddied);
INIT_LIST_HEAD(&pool->lru);
- pool->pages_nr = 0;
+ atomic64_set(&pool->pages_nr, 0);
pool->ops = ops;
return pool;
}
@@ -250,25 +292,58 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
kfree(pool);
}
+static inline void *mchunk_memmove(struct z3fold_header *zhdr,
+ unsigned short dst_chunk)
+{
+ void *beg = zhdr;
+ return memmove(beg + (dst_chunk << CHUNK_SHIFT),
+ beg + (zhdr->start_middle << CHUNK_SHIFT),
+ zhdr->middle_chunks << CHUNK_SHIFT);
+}
+
+#define BIG_CHUNK_GAP 3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
struct page *page = virt_to_page(zhdr);
- void *beg = zhdr;
+ if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
+ return 0; /* can't move middle chunk, it's used */
+
+ if (zhdr->middle_chunks == 0)
+ return 0; /* nothing to compact */
- if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
- zhdr->middle_chunks != 0 &&
- zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- memmove(beg + ZHDR_SIZE_ALIGNED,
- beg + (zhdr->start_middle << CHUNK_SHIFT),
- zhdr->middle_chunks << CHUNK_SHIFT);
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /* move to the beginning */
+ mchunk_memmove(zhdr, ZHDR_CHUNKS);
zhdr->first_chunks = zhdr->middle_chunks;
zhdr->middle_chunks = 0;
zhdr->start_middle = 0;
zhdr->first_num++;
return 1;
}
+
+ /*
+ * moving data is expensive, so let's only do that if
+ * there's substantial gain (at least BIG_CHUNK_GAP chunks)
+ */
+ if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
+ zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
+ BIG_CHUNK_GAP) {
+ mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
+ zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
+ return 1;
+ } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
+ TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
+ + zhdr->middle_chunks) >=
+ BIG_CHUNK_GAP) {
+ unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
+ zhdr->middle_chunks;
+ mchunk_memmove(zhdr, new_start);
+ zhdr->start_middle = new_start;
+ return 1;
+ }
+
return 0;
}
@@ -309,50 +384,62 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
bud = HEADLESS;
else {
chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
/* First, try to find an unbuddied z3fold page. */
zhdr = NULL;
for_each_unbuddied_list(i, chunks) {
- if (!list_empty(&pool->unbuddied[i])) {
- zhdr = list_first_entry(&pool->unbuddied[i],
+ spin_lock(&pool->lock);
+ zhdr = list_first_entry_or_null(&pool->unbuddied[i],
struct z3fold_header, buddy);
- page = virt_to_page(zhdr);
- if (zhdr->first_chunks == 0) {
- if (zhdr->middle_chunks != 0 &&
- chunks >= zhdr->start_middle)
- bud = LAST;
- else
- bud = FIRST;
- } else if (zhdr->last_chunks == 0)
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (zhdr->first_chunks == 0) {
+ if (zhdr->middle_chunks != 0 &&
+ chunks >= zhdr->start_middle)
bud = LAST;
- else if (zhdr->middle_chunks == 0)
- bud = MIDDLE;
- else {
- pr_err("No free chunks in unbuddied\n");
- WARN_ON(1);
- continue;
- }
- list_del(&zhdr->buddy);
- goto found;
+ else
+ bud = FIRST;
+ } else if (zhdr->last_chunks == 0)
+ bud = LAST;
+ else if (zhdr->middle_chunks == 0)
+ bud = MIDDLE;
+ else {
+ z3fold_page_unlock(zhdr);
+ spin_lock(&pool->lock);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
+ spin_unlock(&pool->lock);
+ pr_err("No free chunks in unbuddied\n");
+ WARN_ON(1);
+ continue;
}
+ goto found;
}
bud = FIRST;
- spin_unlock(&pool->lock);
}
/* Couldn't find unbuddied z3fold page, create new one */
page = alloc_page(gfp);
if (!page)
return -ENOMEM;
- spin_lock(&pool->lock);
- pool->pages_nr++;
+
+ atomic64_inc(&pool->pages_nr);
zhdr = init_z3fold_page(page);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
+ spin_lock(&pool->lock);
goto headless;
}
+ z3fold_page_lock(zhdr);
found:
if (bud == FIRST)
@@ -361,17 +448,15 @@ found:
zhdr->last_chunks = chunks;
else {
zhdr->middle_chunks = chunks;
- zhdr->start_middle = zhdr->first_chunks + 1;
+ zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
+ spin_lock(&pool->lock);
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
/* Add to unbuddied list */
freechunks = num_free_chunks(zhdr);
list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* Add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
}
headless:
@@ -383,6 +468,8 @@ headless:
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
+ if (bud != HEADLESS)
+ z3fold_page_unlock(zhdr);
return 0;
}
@@ -404,7 +491,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy bud;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
@@ -412,6 +498,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
/* HEADLESS page stored */
bud = HEADLESS;
} else {
+ z3fold_page_lock(zhdr);
bud = handle_to_buddy(handle);
switch (bud) {
@@ -428,38 +515,36 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
default:
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
- spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
return;
}
}
- if (test_bit(UNDER_RECLAIM, &page->private)) {
- /* z3fold page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
- return;
- }
-
- if (bud != HEADLESS) {
- /* Remove from existing buddy list */
- list_del(&zhdr->buddy);
- }
-
- if (bud == HEADLESS ||
- (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
- zhdr->last_chunks == 0)) {
- /* z3fold page is empty, free */
+ if (bud == HEADLESS) {
+ spin_lock(&pool->lock);
list_del(&page->lru);
- clear_bit(PAGE_HEADLESS, &page->private);
- free_z3fold_page(zhdr);
- pool->pages_nr--;
+ spin_unlock(&pool->lock);
+ free_z3fold_page(page);
+ atomic64_dec(&pool->pages_nr);
} else {
- z3fold_compact_page(zhdr);
- /* Add to the unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 ||
+ zhdr->last_chunks != 0) {
+ z3fold_compact_page(zhdr);
+ /* Add to the unbuddied list */
+ spin_lock(&pool->lock);
+ if (!list_empty(&zhdr->buddy))
+ list_del(&zhdr->buddy);
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ }
+ z3fold_page_unlock(zhdr);
+ spin_lock(&pool->lock);
+ if (kref_put(&zhdr->refcount, release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
+ spin_unlock(&pool->lock);
}
- spin_unlock(&pool->lock);
}
/**
@@ -506,20 +591,25 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
- retries == 0) {
+ if (!pool->ops || !pool->ops->evict || retries == 0) {
spin_unlock(&pool->lock);
return -EINVAL;
}
for (i = 0; i < retries; i++) {
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
page = list_last_entry(&pool->lru, struct page, lru);
- list_del(&page->lru);
+ list_del_init(&page->lru);
- /* Protect z3fold page against free */
- set_bit(UNDER_RECLAIM, &page->private);
zhdr = page_address(page);
if (!test_bit(PAGE_HEADLESS, &page->private)) {
- list_del(&zhdr->buddy);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ kref_get(&zhdr->refcount);
+ spin_unlock(&pool->lock);
+ z3fold_page_lock(zhdr);
/*
* We need encode the handles before unlocking, since
* we can race with free that will set
@@ -534,13 +624,13 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
+ z3fold_page_unlock(zhdr);
} else {
first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
+ spin_unlock(&pool->lock);
}
- spin_unlock(&pool->lock);
-
/* Issue the eviction callback(s) */
if (middle_handle) {
ret = pool->ops->evict(pool, middle_handle);
@@ -558,36 +648,41 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
- clear_bit(UNDER_RECLAIM, &page->private);
- if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
- (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
- zhdr->middle_chunks == 0)) {
- /*
- * All buddies are now free, free the z3fold page and
- * return success.
- */
- clear_bit(PAGE_HEADLESS, &page->private);
- free_z3fold_page(zhdr);
- pool->pages_nr--;
- spin_unlock(&pool->lock);
- return 0;
- } else if (!test_bit(PAGE_HEADLESS, &page->private)) {
- if (zhdr->first_chunks != 0 &&
- zhdr->last_chunks != 0 &&
- zhdr->middle_chunks != 0) {
- /* Full, add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ if (ret == 0) {
+ free_z3fold_page(page);
+ return 0;
} else {
+ spin_lock(&pool->lock);
+ }
+ } else {
+ z3fold_page_lock(zhdr);
+ if ((zhdr->first_chunks || zhdr->last_chunks ||
+ zhdr->middle_chunks) &&
+ !(zhdr->first_chunks && zhdr->last_chunks &&
+ zhdr->middle_chunks)) {
z3fold_compact_page(zhdr);
/* add to unbuddied list */
+ spin_lock(&pool->lock);
freechunks = num_free_chunks(zhdr);
list_add(&zhdr->buddy,
&pool->unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ }
+ z3fold_page_unlock(zhdr);
+ spin_lock(&pool->lock);
+ if (kref_put(&zhdr->refcount, release_z3fold_page)) {
+ spin_unlock(&pool->lock);
+ atomic64_dec(&pool->pages_nr);
+ return 0;
}
}
- /* add to beginning of LRU */
+ /*
+ * Add to the beginning of LRU.
+ * Pool lock has to be kept here to ensure the page has
+ * not already been released
+ */
list_add(&page->lru, &pool->lru);
}
spin_unlock(&pool->lock);
@@ -611,7 +706,6 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
void *addr;
enum buddy buddy;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
addr = zhdr;
page = virt_to_page(zhdr);
@@ -619,6 +713,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
if (test_bit(PAGE_HEADLESS, &page->private))
goto out;
+ z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
switch (buddy) {
case FIRST:
@@ -637,8 +732,9 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
addr = NULL;
break;
}
+
+ z3fold_page_unlock(zhdr);
out:
- spin_unlock(&pool->lock);
return addr;
}
@@ -653,31 +749,28 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
struct page *page;
enum buddy buddy;
- spin_lock(&pool->lock);
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- spin_unlock(&pool->lock);
+ if (test_bit(PAGE_HEADLESS, &page->private))
return;
- }
+ z3fold_page_lock(zhdr);
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
}
/**
* z3fold_get_pool_size() - gets the z3fold pool size in pages
* @pool: pool whose size is being queried
*
- * Returns: size in pages of the given pool. The pool lock need not be
- * taken to access pages_nr.
+ * Returns: size in pages of the given pool.
*/
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
- return pool->pages_nr;
+ return atomic64_read(&pool->pages_nr);
}
/*****************
@@ -776,8 +869,8 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
- /* Make sure the z3fold header will fit in one chunk */
- BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+ /* Make sure the z3fold header is not larger than the page size */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
zpool_register_driver(&z3fold_zpool_driver);
return 0;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b0bc023d25c5..d41edd28298b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -24,8 +24,7 @@
*
* Usage of struct page flags:
* PG_private: identifies the first component page
- * PG_private2: identifies the last component page
- * PG_owner_priv_1: indentifies the huge component page
+ * PG_owner_priv_1: identifies the huge component page
*
*/
@@ -34,6 +33,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/magic.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -268,10 +268,6 @@ struct zs_pool {
#endif
};
-/*
- * A zspage's class index and fullness group
- * are encoded in its (first)page->mapping
- */
#define FULLNESS_BITS 2
#define CLASS_BITS 8
#define ISOLATED_BITS 3
@@ -280,7 +276,7 @@ struct zs_pool {
struct zspage {
struct {
unsigned int fullness:FULLNESS_BITS;
- unsigned int class:CLASS_BITS;
+ unsigned int class:CLASS_BITS + 1;
unsigned int isolated:ISOLATED_BITS;
unsigned int magic:MAGIC_VAL_BITS;
};
@@ -364,7 +360,7 @@ static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
return kmem_cache_alloc(pool->zspage_cachep,
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
-};
+}
static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
@@ -938,7 +934,6 @@ static void reset_page(struct page *page)
{
__ClearPageMovable(page);
ClearPagePrivate(page);
- ClearPagePrivate2(page);
set_page_private(page, 0);
page_mapcount_reset(page);
ClearPageHugeObject(page);
@@ -1085,7 +1080,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
* 2. each sub-page point to zspage using page->private
*
* we set PG_private to identify the first page (i.e. no other sub-page
- * has this flag set) and PG_private_2 to identify the last page.
+ * has this flag set).
*/
for (i = 0; i < nr_pages; i++) {
page = pages[i];
@@ -1100,8 +1095,6 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
} else {
prev_page->freelist = page;
}
- if (i == nr_pages - 1)
- SetPagePrivate2(page);
prev_page = page;
}
}
@@ -1284,61 +1277,21 @@ out:
#endif /* CONFIG_PGTABLE_MAPPING */
-static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
- void *pcpu)
+static int zs_cpu_prepare(unsigned int cpu)
{
- int ret, cpu = (long)pcpu;
struct mapping_area *area;
- switch (action) {
- case CPU_UP_PREPARE:
- area = &per_cpu(zs_map_area, cpu);
- ret = __zs_cpu_up(area);
- if (ret)
- return notifier_from_errno(ret);
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- area = &per_cpu(zs_map_area, cpu);
- __zs_cpu_down(area);
- break;
- }
-
- return NOTIFY_OK;
+ area = &per_cpu(zs_map_area, cpu);
+ return __zs_cpu_up(area);
}
-static struct notifier_block zs_cpu_nb = {
- .notifier_call = zs_cpu_notifier
-};
-
-static int zs_register_cpu_notifier(void)
+static int zs_cpu_dead(unsigned int cpu)
{
- int cpu, uninitialized_var(ret);
-
- cpu_notifier_register_begin();
-
- __register_cpu_notifier(&zs_cpu_nb);
- for_each_online_cpu(cpu) {
- ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret))
- break;
- }
-
- cpu_notifier_register_done();
- return notifier_to_errno(ret);
-}
-
-static void zs_unregister_cpu_notifier(void)
-{
- int cpu;
-
- cpu_notifier_register_begin();
-
- for_each_online_cpu(cpu)
- zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
- __unregister_cpu_notifier(&zs_cpu_nb);
+ struct mapping_area *area;
- cpu_notifier_register_done();
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ return 0;
}
static void __init init_zs_size_classes(void)
@@ -2423,7 +2376,7 @@ struct zs_pool *zs_create_pool(const char *name)
goto err;
/*
- * Iterate reversly, because, size of size_class that we want to use
+ * Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
*/
for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -2534,10 +2487,10 @@ static int __init zs_init(void)
if (ret)
goto out;
- ret = zs_register_cpu_notifier();
-
+ ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
+ zs_cpu_prepare, zs_cpu_dead);
if (ret)
- goto notifier_fail;
+ goto hp_setup_fail;
init_zs_size_classes();
@@ -2549,8 +2502,7 @@ static int __init zs_init(void)
return 0;
-notifier_fail:
- zs_unregister_cpu_notifier();
+hp_setup_fail:
zsmalloc_unmount();
out:
return ret;
@@ -2562,7 +2514,7 @@ static void __exit zs_exit(void)
zpool_unregister_driver(&zs_zpool_driver);
#endif
zsmalloc_unmount();
- zs_unregister_cpu_notifier();
+ cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 275b22cc8df4..eedc27894b10 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -76,9 +76,17 @@ static u64 zswap_duplicate_entry;
* tunables
**********************************/
+#define ZSWAP_PARAM_UNSET ""
+
/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
-module_param_named(enabled, zswap_enabled, bool, 0644);
+static int zswap_enabled_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_enabled_param_ops = {
+ .set = zswap_enabled_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
@@ -118,7 +126,7 @@ struct zswap_pool {
struct kref kref;
struct list_head list;
struct work_struct work;
- struct notifier_block notifier;
+ struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -176,6 +184,12 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0);
/* used by param callback function */
static bool zswap_init_started;
+/* fatal error during init */
+static bool zswap_init_failed;
+
+/* init completed, but couldn't create the initial pool */
+static bool zswap_has_pool;
+
/*********************************
* helpers and fwd declarations
**********************************/
@@ -352,143 +366,58 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
-static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
+static int zswap_dstmem_prepare(unsigned int cpu)
{
u8 *dst;
- switch (action) {
- case CPU_UP_PREPARE:
- dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!dst) {
- pr_err("can't allocate compressor buffer\n");
- return NOTIFY_BAD;
- }
- per_cpu(zswap_dstmem, cpu) = dst;
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- dst = per_cpu(zswap_dstmem, cpu);
- kfree(dst);
- per_cpu(zswap_dstmem, cpu) = NULL;
- break;
- default:
- break;
+ dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!dst) {
+ pr_err("can't allocate compressor buffer\n");
+ return -ENOMEM;
}
- return NOTIFY_OK;
+ per_cpu(zswap_dstmem, cpu) = dst;
+ return 0;
}
-static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
- unsigned long action, void *pcpu)
+static int zswap_dstmem_dead(unsigned int cpu)
{
- return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
-}
+ u8 *dst;
-static struct notifier_block zswap_dstmem_notifier = {
- .notifier_call = zswap_cpu_dstmem_notifier,
-};
+ dst = per_cpu(zswap_dstmem, cpu);
+ kfree(dst);
+ per_cpu(zswap_dstmem, cpu) = NULL;
-static int __init zswap_cpu_dstmem_init(void)
-{
- unsigned long cpu;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
- NOTIFY_BAD)
- goto cleanup;
- __register_cpu_notifier(&zswap_dstmem_notifier);
- cpu_notifier_register_done();
return 0;
-
-cleanup:
- for_each_online_cpu(cpu)
- __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
- cpu_notifier_register_done();
- return -ENOMEM;
-}
-
-static void zswap_cpu_dstmem_destroy(void)
-{
- unsigned long cpu;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
- __unregister_cpu_notifier(&zswap_dstmem_notifier);
- cpu_notifier_register_done();
}
-static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
- unsigned long action, unsigned long cpu)
+static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_comp *tfm;
- switch (action) {
- case CPU_UP_PREPARE:
- if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
- break;
- tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
- if (IS_ERR_OR_NULL(tfm)) {
- pr_err("could not alloc crypto comp %s : %ld\n",
- pool->tfm_name, PTR_ERR(tfm));
- return NOTIFY_BAD;
- }
- *per_cpu_ptr(pool->tfm, cpu) = tfm;
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED:
- tfm = *per_cpu_ptr(pool->tfm, cpu);
- if (!IS_ERR_OR_NULL(tfm))
- crypto_free_comp(tfm);
- *per_cpu_ptr(pool->tfm, cpu) = NULL;
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static int zswap_cpu_comp_notifier(struct notifier_block *nb,
- unsigned long action, void *pcpu)
-{
- unsigned long cpu = (unsigned long)pcpu;
- struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
-
- return __zswap_cpu_comp_notifier(pool, action, cpu);
-}
+ if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+ return 0;
-static int zswap_cpu_comp_init(struct zswap_pool *pool)
-{
- unsigned long cpu;
-
- memset(&pool->notifier, 0, sizeof(pool->notifier));
- pool->notifier.notifier_call = zswap_cpu_comp_notifier;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
- NOTIFY_BAD)
- goto cleanup;
- __register_cpu_notifier(&pool->notifier);
- cpu_notifier_register_done();
+ tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+ if (IS_ERR_OR_NULL(tfm)) {
+ pr_err("could not alloc crypto comp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(tfm));
+ return -ENOMEM;
+ }
+ *per_cpu_ptr(pool->tfm, cpu) = tfm;
return 0;
-
-cleanup:
- for_each_online_cpu(cpu)
- __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
- cpu_notifier_register_done();
- return -ENOMEM;
}
-static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
- unsigned long cpu;
+ struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
+ struct crypto_comp *tfm;
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
- __unregister_cpu_notifier(&pool->notifier);
- cpu_notifier_register_done();
+ tfm = *per_cpu_ptr(pool->tfm, cpu);
+ if (!IS_ERR_OR_NULL(tfm))
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(pool->tfm, cpu) = NULL;
+ return 0;
}
/*********************************
@@ -500,7 +429,8 @@ static struct zswap_pool *__zswap_pool_current(void)
struct zswap_pool *pool;
pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
- WARN_ON(!pool);
+ WARN_ONCE(!pool && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
return pool;
}
@@ -519,7 +449,7 @@ static struct zswap_pool *zswap_pool_current_get(void)
rcu_read_lock();
pool = __zswap_pool_current();
- if (!pool || !zswap_pool_get(pool))
+ if (!zswap_pool_get(pool))
pool = NULL;
rcu_read_unlock();
@@ -535,7 +465,9 @@ static struct zswap_pool *zswap_pool_last_get(void)
list_for_each_entry_rcu(pool, &zswap_pools, list)
last = pool;
- if (!WARN_ON(!last) && !zswap_pool_get(last))
+ WARN_ONCE(!last && zswap_has_pool,
+ "%s: no page storage pool!\n", __func__);
+ if (!zswap_pool_get(last))
last = NULL;
rcu_read_unlock();
@@ -569,6 +501,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ int ret;
+
+ if (!zswap_has_pool) {
+ /* if either are unset, pool initialization failed, and we
+ * need both params to be set correctly before trying to
+ * create a pool.
+ */
+ if (!strcmp(type, ZSWAP_PARAM_UNSET))
+ return NULL;
+ if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
+ return NULL;
+ }
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
@@ -593,7 +537,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
- if (zswap_cpu_comp_init(pool))
+ ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
+ &pool->node);
+ if (ret)
goto error;
pr_debug("using %s compressor\n", pool->tfm_name);
@@ -617,29 +563,41 @@ error:
static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
- if (!crypto_has_comp(zswap_compressor, 0, 0)) {
- if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
- pr_err("default compressor %s not available\n",
- zswap_compressor);
- return NULL;
- }
+ bool has_comp, has_zpool;
+
+ has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+ has_comp = crypto_has_comp(zswap_compressor, 0, 0);
}
- if (!zpool_has_pool(zswap_zpool_type)) {
- if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
- pr_err("default zpool %s not available\n",
- zswap_zpool_type);
- return NULL;
- }
+ if (!has_comp) {
+ pr_err("default compressor %s not available\n",
+ zswap_compressor);
+ param_free_charp(&zswap_compressor);
+ zswap_compressor = ZSWAP_PARAM_UNSET;
+ }
+
+ has_zpool = zpool_has_pool(zswap_zpool_type);
+ if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_err("zpool %s not available, using default %s\n",
zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
param_free_charp(&zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+ has_zpool = zpool_has_pool(zswap_zpool_type);
+ }
+ if (!has_zpool) {
+ pr_err("default zpool %s not available\n",
+ zswap_zpool_type);
+ param_free_charp(&zswap_zpool_type);
+ zswap_zpool_type = ZSWAP_PARAM_UNSET;
}
+ if (!has_comp || !has_zpool)
+ return NULL;
+
return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}
@@ -647,7 +605,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
{
zswap_pool_debug("destroying", pool);
- zswap_cpu_comp_destroy(pool);
+ cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->tfm);
zpool_destroy_pool(pool->zpool);
kfree(pool);
@@ -655,6 +613,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
+ if (!pool)
+ return 0;
+
return kref_get_unless_zero(&pool->kref);
}
@@ -706,8 +667,13 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *s = strstrip((char *)val);
int ret;
+ if (zswap_init_failed) {
+ pr_err("can't set param, initialization failed\n");
+ return -ENODEV;
+ }
+
/* no change required */
- if (!strcmp(s, *(char **)kp->arg))
+ if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
return 0;
/* if this is load-time (pre-init) param setting,
@@ -738,21 +704,26 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
pool = zswap_pool_find_get(type, compressor);
if (pool) {
zswap_pool_debug("using existing", pool);
+ WARN_ON(pool == zswap_pool_current());
list_del_rcu(&pool->list);
- } else {
- spin_unlock(&zswap_pools_lock);
- pool = zswap_pool_create(type, compressor);
- spin_lock(&zswap_pools_lock);
}
+ spin_unlock(&zswap_pools_lock);
+
+ if (!pool)
+ pool = zswap_pool_create(type, compressor);
+
if (pool)
ret = param_set_charp(s, kp);
else
ret = -EINVAL;
+ spin_lock(&zswap_pools_lock);
+
if (!ret) {
put_pool = zswap_pool_current();
list_add_rcu(&pool->list, &zswap_pools);
+ zswap_has_pool = true;
} else if (pool) {
/* add the possibly pre-existing pool to the end of the pools
* list; if it's new (and empty) then it'll be removed and
@@ -764,6 +735,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
spin_unlock(&zswap_pools_lock);
+ if (!zswap_has_pool && !pool) {
+ /* if initial pool creation failed, and this pool creation also
+ * failed, maybe both compressor and zpool params were bad.
+ * Allow changing this param, so pool creation will succeed
+ * when the other param is changed. We already verified this
+ * param is ok in the zpool_has_pool() or crypto_has_comp()
+ * checks above.
+ */
+ ret = param_set_charp(s, kp);
+ }
+
/* drop the ref from either the old current pool,
* or the new pool we failed to add
*/
@@ -785,6 +767,21 @@ static int zswap_zpool_param_set(const char *val,
return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
+static int zswap_enabled_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ if (zswap_init_failed) {
+ pr_err("can't enable, initialization failed\n");
+ return -ENODEV;
+ }
+ if (!zswap_has_pool && zswap_init_started) {
+ pr_err("can't enable, no pool configured\n");
+ return -ENODEV;
+ }
+
+ return param_set_bool(val, kp);
+}
+
/*********************************
* writeback code
**********************************/
@@ -1238,6 +1235,7 @@ static void __exit zswap_debugfs_exit(void) { }
static int __init init_zswap(void)
{
struct zswap_pool *pool;
+ int ret;
zswap_init_started = true;
@@ -1246,31 +1244,44 @@ static int __init init_zswap(void)
goto cache_fail;
}
- if (zswap_cpu_dstmem_init()) {
+ ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
+ zswap_dstmem_prepare, zswap_dstmem_dead);
+ if (ret) {
pr_err("dstmem alloc failed\n");
goto dstmem_fail;
}
+ ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
+ "mm/zswap_pool:prepare",
+ zswap_cpu_comp_prepare,
+ zswap_cpu_comp_dead);
+ if (ret)
+ goto hp_fail;
+
pool = __zswap_pool_create_fallback();
- if (!pool) {
+ if (pool) {
+ pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+ zpool_get_type(pool->zpool));
+ list_add(&pool->list, &zswap_pools);
+ zswap_has_pool = true;
+ } else {
pr_err("pool creation failed\n");
- goto pool_fail;
+ zswap_enabled = false;
}
- pr_info("loaded using pool %s/%s\n", pool->tfm_name,
- zpool_get_type(pool->zpool));
-
- list_add(&pool->list, &zswap_pools);
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
-pool_fail:
- zswap_cpu_dstmem_destroy();
+hp_fail:
+ cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
zswap_entry_cache_destroy();
cache_fail:
+ /* if built-in, we aren't unloaded on failure; don't allow use */
+ zswap_init_failed = true;
+ zswap_enabled = false;
return -ENOMEM;
}
/* must be late so crypto has time to come up */