diff options
Diffstat (limited to 'mm')
83 files changed, 7182 insertions, 3591 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 86e3e0e74d20..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 + depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG depends on NUMA default n help @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index afcc550877ff..79d0fd13b5b3 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -90,3 +90,9 @@ config DEBUG_PAGE_REF careful when enabling this feature because it adds about 30 KB to the kernel code. However the runtime performance overhead is virtually nil until the tracepoints are actually enabled. + +config DEBUG_RODATA_TEST + bool "Testcase for the marking rodata read-only" + depends on STRICT_KERNEL_RWX + ---help--- + This option enables a testcase for the setting rodata read-only. diff --git a/mm/Makefile b/mm/Makefile index 295bd7a9f76b..026f6a828a50 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + mlock.o mmap.o mprotect.o mremap.o msync.o \ + page_vma_mapped.o pagewalk.o pgtable-generic.o \ + rmap.o vmalloc.o + ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o @@ -35,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) @@ -83,6 +85,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fde443f36d7..c6f2a37028c2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -237,6 +237,7 @@ static __init int bdi_class_init(void) bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); + return 0; } postcore_initcall(bdi_class_init); @@ -310,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + wb->dirty_sleep = jiffies; wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); if (!wb->congested) @@ -409,8 +411,8 @@ retry: while (*node != NULL) { parent = *node; - congested = container_of(parent, struct bdi_writeback_congested, - rb_node); + congested = rb_entry(parent, struct bdi_writeback_congested, + rb_node); if (congested->blkcg_id < blkcg_id) node = &parent->rb_left; else if (congested->blkcg_id > blkcg_id) @@ -681,33 +683,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; - struct rb_node *rbn; void **slot; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); - radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); /* - * All cgwb's and their congested states must be shutdown and - * released before returning. Drain the usage counter to wait for - * all cgwb's and cgwb_congested's ever created on @bdi. + * All cgwb's must be shutdown and released before returning. Drain + * the usage counter to wait for all cgwb's ever created on @bdi. */ atomic_dec(&bdi->usage_cnt); wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); + /* + * Grab back our reference so that we hold it when @bdi gets + * re-registered. + */ + atomic_inc(&bdi->usage_cnt); } /** @@ -747,6 +742,21 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_exit(struct backing_dev_info *bdi) +{ + struct rb_node *rbn; + + spin_lock_irq(&cgwb_lock); + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { + struct bdi_writeback_congested *congested = + rb_entry(rbn, struct bdi_writeback_congested, rb_node); + + rb_erase(rbn, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -757,9 +767,11 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; + atomic_set(&bdi->wb_congested->refcnt, 1); + err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { - kfree(bdi->wb_congested); + wb_congested_put(bdi->wb_congested); return err; } return 0; @@ -767,6 +779,11 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } +static void cgwb_bdi_exit(struct backing_dev_info *bdi) +{ + wb_congested_put(bdi->wb_congested); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) @@ -775,6 +792,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->dev = NULL; + kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; @@ -790,6 +808,22 @@ int bdi_init(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_init); +struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +{ + struct backing_dev_info *bdi; + + bdi = kmalloc_node(sizeof(struct backing_dev_info), + gfp_mask | __GFP_ZERO, node_id); + if (!bdi) + return NULL; + + if (bdi_init(bdi)) { + kfree(bdi); + return NULL; + } + return bdi; +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -833,6 +867,8 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) MINOR(owner->devt)); if (rc) return rc; + /* Leaking owner reference... */ + WARN_ON(bdi->owner); bdi->owner = owner; get_device(owner); return 0; @@ -870,10 +906,25 @@ void bdi_unregister(struct backing_dev_info *bdi) } } -void bdi_exit(struct backing_dev_info *bdi) +static void bdi_exit(struct backing_dev_info *bdi) { WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); + cgwb_bdi_exit(bdi); +} + +static void release_bdi(struct kref *ref) +{ + struct backing_dev_info *bdi = + container_of(ref, struct backing_dev_info, refcnt); + + bdi_exit(bdi); + kfree(bdi); +} + +void bdi_put(struct backing_dev_info *bdi) +{ + kref_put(&bdi->refcnt, release_bdi); } void bdi_destroy(struct backing_dev_info *bdi) diff --git a/mm/bootmem.c b/mm/bootmem.c index e8a55a3c9feb..9fedb27c6451 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = DIV_ROUND_UP(pages, 8); + unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); return ALIGN(bytes, sizeof(long)); } @@ -235,18 +235,13 @@ int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t highmem_start; int ret = 0; -#ifdef CONFIG_X86 /* - * high_memory isn't direct mapped memory so retrieving its physical - * address isn't appropriate. But it would be useful to check the - * physical address of the highmem boundary so it's justifiable to get - * the physical address from it. On x86 there is a validation check for - * this case, so the following workaround is needed to avoid it. + * We can't use __pa(high_memory) directly, since high_memory + * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly) + * complain. Find the boundary by adding one to the last valid + * address. */ - highmem_start = __pa_nodebug(high_memory); -#else - highmem_start = __pa(high_memory); -#endif + highmem_start = __pa(high_memory - 1) + 1; pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); @@ -353,6 +348,32 @@ err: return ret; } +#ifdef CONFIG_CMA_DEBUG +static void cma_debug_show_areas(struct cma *cma) +{ + unsigned long next_zero_bit, next_set_bit; + unsigned long start = 0; + unsigned int nr_zero, nr_total = 0; + + mutex_lock(&cma->lock); + pr_info("number of available pages: "); + for (;;) { + next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); + if (next_zero_bit >= cma->count) + break; + next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + nr_zero = next_set_bit - next_zero_bit; + pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); + nr_total += nr_zero; + start = next_zero_bit + nr_zero; + } + pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + mutex_unlock(&cma->lock); +} +#else +static inline void cma_debug_show_areas(struct cma *cma) { } +#endif + /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. @@ -362,14 +383,15 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) +struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; - int ret; + int ret = -ENOMEM; if (!cma || !cma->count) return NULL; @@ -407,7 +429,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + gfp_mask); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -426,6 +449,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) trace_cma_alloc(pfn, page, count, align); + if (ret) { + pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, count, ret); + cma_debug_show_areas(cma); + } + pr_debug("%s(): returned %p\n", __func__, page); return page; } diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f8e4b60db167..ffc0c3d0ae64 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0); + p = cma_alloc(cma, count, 0, GFP_KERNEL); if (!p) { kfree(mem); return -ENOMEM; diff --git a/mm/compaction.c b/mm/compaction.c index 0409a4ad6ea1..81e1eaa2a2cf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -12,6 +12,7 @@ #include <linux/migrate.h> #include <linux/compaction.h> #include <linux/mm_inline.h> +#include <linux/sched/signal.h> #include <linux/backing-dev.h> #include <linux/sysctl.h> #include <linux/sysfs.h> @@ -548,7 +549,7 @@ isolate_fail: if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; @@ -634,22 +635,6 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } -/* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) -{ - struct page *page; - unsigned int count[2] = { 0, }; - - if (list_empty(&cc->migratepages)) - return; - - list_for_each_entry(page, &cc->migratepages, lru) - count[!!page_is_file_cache(page)]++; - - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); -} - /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { @@ -818,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = false; } - if (isolate_movable_page(page, isolate_mode)) + if (!isolate_movable_page(page, isolate_mode)) goto isolate_success; } @@ -834,6 +819,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page_count(page) > page_mapcount(page)) goto isolate_fail; + /* + * Only allow to migrate anonymous pages in GFP_NOFS context + * because those do not depend on fs locks. + */ + if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) + goto isolate_fail; + /* If we already hold the lock, we can skip some rechecking */ if (!locked) { locked = compact_trylock_irqsave(zone_lru_lock(zone), @@ -866,6 +858,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -902,7 +896,6 @@ isolate_fail: spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } - acct_isolated(zone, cc); putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; cc->last_migrated_pfn = 0; @@ -939,7 +932,7 @@ isolate_fail: trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); - count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + cc->total_migrate_scanned += nr_scanned; if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); @@ -988,7 +981,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; } - acct_isolated(cc->zone, cc); return pfn; } @@ -1258,10 +1250,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) { - acct_isolated(zone, cc); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; - } /* * Either we isolated something and proceed with migration. Or @@ -1271,7 +1261,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, break; } - acct_isolated(zone, cc); /* Record where migration scanner will be restarted. */ cc->migrate_pfn = low_pfn; @@ -1643,6 +1632,9 @@ out: zone->compact_cached_free_pfn = free_pfn; } + count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); + count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); @@ -1657,6 +1649,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .order = order, .gfp_mask = gfp_mask, .zone = zone, @@ -1696,14 +1690,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio) { - int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* Check if the GFP flags allow compaction */ - if (!may_enter_fs || !may_perform_io) + /* + * Check if the GFP flags allow compaction - GFP_NOIO is really + * tricky context because the migration might require IO + */ + if (!may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); @@ -1767,9 +1763,12 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, + .gfp_mask = GFP_KERNEL, }; @@ -1892,14 +1891,17 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, + .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); - count_vm_event(KCOMPACTD_WAKE); + count_compact_event(KCOMPACTD_WAKE); for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; @@ -1917,6 +1919,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.nr_freepages = 0; cc.nr_migratepages = 0; + cc.total_migrate_scanned = 0; + cc.total_free_scanned = 0; cc.zone = zone; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1935,6 +1939,11 @@ static void kcompactd_do_work(pg_data_t *pgdat) defer_compaction(zone, cc.order); } + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); } @@ -1958,6 +1967,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed in the lockless + * waitqueue_active() call. + */ + smp_acquire__after_ctrl_dep(); + if (pgdat->kcompactd_classzone_idx > classzone_idx) pgdat->kcompactd_classzone_idx = classzone_idx; @@ -2043,33 +2059,38 @@ void kcompactd_stop(int nid) * away, we get changed to run anywhere: as the first one comes back, * restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int kcompactd_cpu_online(unsigned int cpu) { int nid; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; - mask = cpumask_of_node(pgdat->node_id); + mask = cpumask_of_node(pgdat->node_id); - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kcompactd, mask); - } + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); } - return NOTIFY_OK; + return 0; } static int __init kcompactd_init(void) { int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/compaction:online", + kcompactd_cpu_online, NULL); + if (ret < 0) { + pr_err("kcompactd: failed to register hotplug callbacks.\n"); + return ret; + } for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); - hotcpu_notifier(cpu_callback, 0); return 0; } subsys_initcall(kcompactd_init) diff --git a/mm/debug.c b/mm/debug.c index 9feb699c5d25..db1cd26d8752 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (reason) pr_alert("page dumped because: %s\n", reason); diff --git a/mm/dmapool.c b/mm/dmapool.c index abcbfe86c25a..4d90a64b2fdc 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -93,7 +93,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ - temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", + temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n", pool->name, blocks, pages * (pool->allocation / pool->size), pool->size, pages); @@ -434,11 +434,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", - pool->name, vaddr, (unsigned long long)dma); + "dma_pool_free %s, %p (bad vaddr)/%pad\n", + pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", - pool->name, vaddr, (unsigned long long)dma); + pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n", + pool->name, vaddr, &dma); return; } { @@ -450,11 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n", + pool->name, &dma); else - pr_err("dma_pool_free %s, dma %Lx already free\n", - pool->name, (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %pad already free\n", + pool->name, &dma); return; } } diff --git a/mm/fadvise.c b/mm/fadvise.c index 6c707bfe02fd..a43013112581 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) } if (end_index >= start_index) { - unsigned long count = invalidate_mapping_pages(mapping, + unsigned long count; + + /* + * It's common to FADV_DONTNEED right after + * the read or write that instantiates the + * pages, in which case there will be some + * sitting on the local LRU cache. Try to + * avoid the expensive remote drain and the + * second cache tree walk below by flushing + * them out right away. + */ + lru_add_drain(); + + count = invalidate_mapping_pages(mapping, start_index, end_index); /* diff --git a/mm/filemap.c b/mm/filemap.c index 50b52fe51937..1694623a6289 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,6 +13,7 @@ #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> +#include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> @@ -132,44 +133,28 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != - (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); + dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, - false); + dax_wake_mapping_entry_waiter(mapping, page->index, p, + true); } } - radix_tree_replace_slot(slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + int i, nr; + + /* hugetlb pages are represented by one entry in the radix tree */ + nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - - if (!node) { - VM_BUG_ON_PAGE(nr != 1, page); - /* - * We need a node to properly account shadow - * entries. Don't plant any without. XXX - */ - shadow = NULL; - } - - radix_tree_replace_slot(slot, shadow); + VM_BUG_ON_PAGE(!node && nr != 1, page); - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { @@ -788,45 +740,165 @@ EXPORT_SYMBOL(__page_cache_alloc); * at a cost of "thundering herd" phenomena during rare hash * collisions. */ -wait_queue_head_t *page_waitqueue(struct page *page) +#define PAGE_WAIT_TABLE_BITS 8 +#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) +static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; + +static wait_queue_head_t *page_waitqueue(struct page *page) { - return bit_waitqueue(page, 0); + return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)]; } -EXPORT_SYMBOL(page_waitqueue); -void wait_on_page_bit(struct page *page, int bit_nr) +void __init pagecache_init(void) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + int i; + + for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(&page_wait_table[i]); - if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, - TASK_UNINTERRUPTIBLE); + page_writeback_init(); } -EXPORT_SYMBOL(wait_on_page_bit); -int wait_on_page_bit_killable(struct page *page, int bit_nr) +struct wait_page_key { + struct page *page; + int bit_nr; + int page_match; +}; + +struct wait_page_queue { + struct page *page; + int bit_nr; + wait_queue_t wait; +}; + +static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + struct wait_page_key *key = arg; + struct wait_page_queue *wait_page + = container_of(wait, struct wait_page_queue, wait); + + if (wait_page->page != key->page) + return 0; + key->page_match = 1; - if (!test_bit(bit_nr, &page->flags)) + if (wait_page->bit_nr != key->bit_nr) + return 0; + if (test_bit(key->bit_nr, &key->page->flags)) return 0; - return __wait_on_bit(page_waitqueue(page), &wait, - bit_wait_io, TASK_KILLABLE); + return autoremove_wake_function(wait, mode, sync, key); } -int wait_on_page_bit_killable_timeout(struct page *page, - int bit_nr, unsigned long timeout) +static void wake_up_page_bit(struct page *page, int bit_nr) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + wait_queue_head_t *q = page_waitqueue(page); + struct wait_page_key key; + unsigned long flags; - wait.key.timeout = jiffies + timeout; - if (!test_bit(bit_nr, &page->flags)) - return 0; - return __wait_on_bit(page_waitqueue(page), &wait, - bit_wait_io_timeout, TASK_KILLABLE); + key.page = page; + key.bit_nr = bit_nr; + key.page_match = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_locked_key(q, TASK_NORMAL, &key); + /* + * It is possible for other pages to have collided on the waitqueue + * hash, so in that case check for a page match. That prevents a long- + * term waiter + * + * It is still possible to miss a case here, when we woke page waiters + * and removed them from the waitqueue, but there are still other + * page waiters. + */ + if (!waitqueue_active(q) || !key.page_match) { + ClearPageWaiters(page); + /* + * It's possible to miss clearing Waiters here, when we woke + * our page waiters, but the hashed waitqueue has waiters for + * other pages on it. + * + * That's okay, it's a rare case. The next waker will clear it. + */ + } + spin_unlock_irqrestore(&q->lock, flags); +} + +static void wake_up_page(struct page *page, int bit) +{ + if (!PageWaiters(page)) + return; + wake_up_page_bit(page, bit); +} + +static inline int wait_on_page_bit_common(wait_queue_head_t *q, + struct page *page, int bit_nr, int state, bool lock) +{ + struct wait_page_queue wait_page; + wait_queue_t *wait = &wait_page.wait; + int ret = 0; + + init_wait(wait); + wait->func = wake_page_function; + wait_page.page = page; + wait_page.bit_nr = bit_nr; + + for (;;) { + spin_lock_irq(&q->lock); + + if (likely(list_empty(&wait->task_list))) { + if (lock) + __add_wait_queue_tail_exclusive(q, wait); + else + __add_wait_queue(q, wait); + SetPageWaiters(page); + } + + set_current_state(state); + + spin_unlock_irq(&q->lock); + + if (likely(test_bit(bit_nr, &page->flags))) { + io_schedule(); + if (unlikely(signal_pending_state(state, current))) { + ret = -EINTR; + break; + } + } + + if (lock) { + if (!test_and_set_bit_lock(bit_nr, &page->flags)) + break; + } else { + if (!test_bit(bit_nr, &page->flags)) + break; + } + } + + finish_wait(q, wait); + + /* + * A signal could leave PageWaiters set. Clearing it here if + * !waitqueue_active would be possible (by open-coding finish_wait), + * but still fail to catch it in the case of wait hash collision. We + * already can fail to clear wait hash collision cases, so don't + * bother with signals either. + */ + + return ret; +} + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + wait_queue_head_t *q = page_waitqueue(page); + wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); +} +EXPORT_SYMBOL(wait_on_page_bit); + +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + wait_queue_head_t *q = page_waitqueue(page); + return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); } -EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue @@ -842,10 +914,34 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter) spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, waiter); + SetPageWaiters(page); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(add_page_wait_queue); +#ifndef clear_bit_unlock_is_negative_byte + +/* + * PG_waiters is the high bit in the same byte as PG_lock. + * + * On x86 (and on many other architectures), we can clear PG_lock and + * test the sign bit at the same time. But if the architecture does + * not support that special operation, we just do this all by hand + * instead. + * + * The read of PG_waiters has to be after (or concurrently with) PG_locked + * being cleared, but a memory barrier should be unneccssary since it is + * in the same byte as PG_locked. + */ +static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) +{ + clear_bit_unlock(nr, mem); + /* smp_mb__after_atomic(); */ + return test_bit(PG_waiters, mem); +} + +#endif + /** * unlock_page - unlock a locked page * @page: the page @@ -855,16 +951,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * mechanism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The mb is necessary to enforce ordering between the clear_bit and the read - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). + * Note that this depends on PG_waiters being the sign bit in the byte + * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to + * clear the PG_locked bit and test PG_waiters at the same time fairly + * portably (architectures that do LL/SC can test any bit, while x86 can + * test the sign bit). */ void unlock_page(struct page *page) { + BUILD_BUG_ON(PG_waiters != 7); page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); - clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_atomic(); - wake_up_page(page, PG_locked); + if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) + wake_up_page_bit(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -910,9 +1009,12 @@ void page_endio(struct page *page, bool is_write, int err) unlock_page(page); } else { if (err) { + struct address_space *mapping; + SetPageError(page); - if (page->mapping) - mapping_set_error(page->mapping, err); + mapping = page_mapping(page); + if (mapping) + mapping_set_error(mapping, err); } end_page_writeback(page); } @@ -921,25 +1023,21 @@ EXPORT_SYMBOL_GPL(page_endio); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @page: the page to lock + * @__page: the page to lock */ -void __lock_page(struct page *page) +void __lock_page(struct page *__page) { - struct page *page_head = compound_head(page); - DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - - __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, - TASK_UNINTERRUPTIBLE); + struct page *page = compound_head(__page); + wait_queue_head_t *q = page_waitqueue(page); + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); } EXPORT_SYMBOL(__lock_page); -int __lock_page_killable(struct page *page) +int __lock_page_killable(struct page *__page) { - struct page *page_head = compound_head(page); - DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - - return __wait_on_bit_lock(page_waitqueue(page_head), &wait, - bit_wait_io, TASK_KILLABLE); + struct page *page = compound_head(__page); + wait_queue_head_t *q = page_waitqueue(page); + return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -1686,7 +1784,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, int error = 0; if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) - return -EINVAL; + return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); index = *ppos >> PAGE_SHIFT; @@ -1703,6 +1801,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, cond_resched(); find_page: + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out; + } + page = find_get_page(mapping, index); if (!page) { page_cache_sync_readahead(mapping, @@ -2070,7 +2173,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, /** * filemap_fault - read in file data for page fault handling - * @vma: vma in which the fault was taken * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a @@ -2092,10 +2194,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { int error; - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2117,12 +2219,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vma, ra, file, page, offset); + do_async_mmap_readahead(vmf->vma, ra, file, page, offset); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vma, ra, file, offset); + do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2130,7 +2232,7 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { put_page(page); return ret | VM_FAULT_RETRY; } @@ -2213,12 +2315,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2274,11 +2376,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2288,7 +2390,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; @@ -2297,14 +2399,14 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); @@ -10,7 +10,7 @@ #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> @@ -226,6 +226,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned int *page_mask) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; spinlock_t *ptl; @@ -243,8 +244,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + pud = pud_offset(p4d, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { @@ -253,6 +259,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -265,8 +278,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - return no_page_table(vma, flags); if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -277,6 +288,9 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + return no_page_table(vma, flags); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); @@ -317,6 +331,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, struct page **page) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -330,7 +345,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, else pgd = pgd_offset_gate(mm, address); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) @@ -572,7 +589,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags); + gup_flags, nonblocking); continue; } } @@ -632,7 +649,8 @@ next_page: return i; } -bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); @@ -857,18 +875,17 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to - * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for + * tsk, mm to be specified. * * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). "FOLL_GET", - * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed - * according to the parameters "pages", "write", "force" - * respectively. + * caller if required (just like with __get_user_pages). "FOLL_GET" + * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -880,7 +897,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: @@ -894,10 +910,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * get_user_pages_unlocked(tsk, mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead, if the two parameters - * "tsk" and "mm" are respectively equal to current and current->mm, - * or if "force" shall be set to 1 (get_user_pages_fast misses the - * "force" parameter). + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -920,6 +934,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -963,10 +980,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -974,8 +991,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -1391,13 +1409,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = READ_ONCE(*pudp); @@ -1419,6 +1437,31 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = READ_ONCE(*p4dp); + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_huge(p4d)); + if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { + if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, + P4D_SHIFT, next, write, pages, nr)) + return 0; + } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. It will only return non-negative values. @@ -1469,7 +1512,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, write, pages, &nr)) break; - } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d4a6e4001512..f3c4f9d22821 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -9,6 +9,8 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/numa_balancing.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mmu_notifier.h> @@ -142,42 +144,6 @@ static struct shrinker huge_zero_page_shrinker = { }; #ifdef CONFIG_SYSFS - -static ssize_t triple_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag deferred, - enum transparent_hugepage_flag req_madv) -{ - if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - if (enabled == deferred) - return -EINVAL; - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(deferred, &transparent_hugepage_flags); - } else if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { - clear_bit(deferred, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(enabled, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - set_bit(req_madv, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - } else - return -EINVAL; - - return count; -} - static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -193,19 +159,28 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret; + ssize_t ret = count; - ret = triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } - return ret; } static struct kobj_attribute enabled_attr = @@ -241,32 +216,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } -/* - * Currently defrag only disables __GFP_NOWAIT for allocation. A blind - * __GFP_REPEAT is too aggressive, it's never worth swapping tons of - * memory just to allocate one more hugepage. - */ static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer madvise never\n"); + return sprintf(buf, "[always] defer defer+madvise madvise never\n"); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [madvise] never\n"); - else - return sprintf(buf, "always defer madvise [never]\n"); - + return sprintf(buf, "always [defer] defer+madvise madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [defer+madvise] madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer defer+madvise [madvise] never\n"); + return sprintf(buf, "always defer defer+madvise madvise [never]\n"); } + static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer+madvise", buf, + min(sizeof("defer+madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; } static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); @@ -285,6 +286,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -307,6 +317,7 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &shmem_enabled_attr.attr, #endif @@ -532,13 +543,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -563,9 +574,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -576,11 +587,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -590,11 +601,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -602,25 +613,28 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, } /* - * If THP defrag is set to always then directly reclaim/compact as necessary - * If set to defer then do only background reclaim/compact and defer to khugepaged - * If set to madvise and the VMA is flagged then directly reclaim/compact - * When direct reclaim/compact is allowed, don't retry except for flagged VMA's + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags) && vma_madvised) - return GFP_TRANSHUGE; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + 0); return GFP_TRANSHUGE_LIGHT; } @@ -641,12 +655,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -654,7 +668,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -670,22 +684,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -697,7 +711,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -737,13 +751,68 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - if (track_pfn_insert(vma, &pgprot, pfn)) - return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pud = pud_mkwrite(pud); + return pud; +} + +static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t entry; + spinlock_t *ptl; + + ptl = pud_lock(mm, pud); + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pud_mkdevmap(entry); + if (write) { + entry = pud_mkyoung(pud_mkdirty(entry)); + entry = maybe_pud_mkwrite(entry, vma); + } + set_pud_at(mm, addr, pud, entry); + update_mmu_cache_pud(vma, addr, pud); + spin_unlock(ptl); +} + +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pud_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON(!pfn_t_devmap(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -772,6 +841,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); + /* + * When we COW a devmap PMD entry, we split it into PTEs, so we should + * not be in this function with `flags & FOLL_COW` set. + */ + WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -868,30 +943,149 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud) +{ + pud_t _pud; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pud is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pud to + * set the young bit, instead of the current set_pud_at. + */ + _pud = pud_mkyoung(pud_mkdirty(*pud)); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, + pud, _pud, 1)) + update_mmu_cache_pud(vma, addr, pud); +} + +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags) +{ + unsigned long pfn = pud_pfn(*pud); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pud_lockptr(mm, pud)); + + if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + if (pud_present(*pud) && pud_devmap(*pud)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pud); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + pud_t pud; + int ret; + + dst_ptl = pud_lock(dst_mm, dst_pud); + src_ptl = pud_lockptr(src_mm, src_pud); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pud = *src_pud; + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + goto out_unlock; + + /* + * When page table lock is held, the huge zero pud should not be + * under splitting since we don't split the page itself, only pud to + * a page table. + */ + if (is_huge_zero_pud(pud)) { + /* No huge zero pud yet */ + } + + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); + set_pud_at(dst_mm, addr, dst_pud, pud); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + return ret; +} + +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ + pud_t entry; + unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); + if (unlikely(!pud_same(*vmf->pud, orig_pud))) + goto unlock; + + entry = pud_mkyoung(orig_pud); + if (write) + entry = pud_mkdirty(entry); + haddr = vmf->address & HPAGE_PUD_MASK; + if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) + update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); + +unlock: + spin_unlock(vmf->ptl); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + if (write) + entry = pmd_mkdirty(entry); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -908,9 +1102,8 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -941,15 +1134,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -958,20 +1151,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -982,7 +1175,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -994,23 +1187,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1023,13 +1216,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1042,12 +1235,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1059,7 +1252,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1079,11 +1272,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1091,12 +1284,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1106,16 +1299,26 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } +/* + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +{ + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); +} + struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -1126,7 +1329,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !pmd_write(*pmd)) + if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) goto out; /* Avoid dumping huge zero page */ @@ -1185,12 +1388,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1198,8 +1401,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1207,9 +1410,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1225,7 +1428,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!pmd_write(pmd)) + if (!pmd_savedwrite(pmd)) flags |= TNF_NO_GROUP; /* @@ -1242,7 +1445,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1253,12 +1456,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1276,9 +1479,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1288,23 +1491,24 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - was_writable = pmd_write(pmd); + was_writable = pmd_savedwrite(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + flags); return 0; } @@ -1322,6 +1526,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; @@ -1362,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, deactivate_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); @@ -1377,12 +1582,23 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; @@ -1398,12 +1614,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); @@ -1416,6 +1632,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1424,6 +1642,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. + */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1461,8 +1694,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl) && - vma_is_anonymous(vma)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); @@ -1491,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mkwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. + * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } @@ -1541,6 +1805,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } +/* + * Returns true if a given pud maps a thp, false otherwise. + * + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. + */ +spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pud, unsigned long addr) +{ + pud_t orig_pud; + spinlock_t *ptl; + + ptl = __pud_trans_huge_lock(pud, vma); + if (!ptl) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pudp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pudp related + * operations. + */ + orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, + tlb->fullmm); + tlb_remove_pud_tlb_entry(tlb, pud, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + /* No zero page support yet */ + } else { + /* No support for anonymous PUD pages yet */ + BUG(); + } + return 1; +} + +static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, + unsigned long haddr) +{ + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); + VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + + count_vm_event(THP_SPLIT_PUD); + + pudp_huge_clear_flush_notify(vma, haddr, pud); +} + +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PUD_MASK; + + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); + ptl = pud_lock(mm, pud); + if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, haddr); + +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { @@ -1588,6 +1930,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); @@ -1731,6 +2079,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -1738,7 +2087,11 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, if (!pgd_present(*pgd)) return; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return; + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return; @@ -1791,32 +2144,27 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | - TTU_RMAP_LOCKED; - int i, ret; + TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; + int ret; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - /* We only need TTU_SPLIT_HUGE_PMD once */ - ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); - for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { - /* Cut short if the page is unmapped */ - if (page_count(page) == 1) - return; - - ret = try_to_unmap(page + i, ttu_flags); - } - VM_BUG_ON_PAGE(ret, page + i - 1); + ret = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(ret, page); } static void unfreeze_page(struct page *page) { int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) - remove_migration_ptes(page + i, page + i, true); + if (PageTransHuge(page)) { + remove_migration_ptes(page, page, true); + } else { + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); + } } static void __split_huge_page_tail(struct page *head, int tail, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 418bf01a50ed..e5828875f7bb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include <linux/bootmem.h> #include <linux/sysfs.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -32,6 +33,7 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> +#include <linux/userfaultfd_k.h> #include "internal.h" int hugepages_treat_as_movable; @@ -1051,7 +1053,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, @@ -1773,23 +1776,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1799,12 +1811,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } @@ -3122,7 +3144,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; @@ -3286,6 +3308,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; @@ -3336,7 +3363,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } pte = huge_ptep_get_and_clear(mm, address, ptep); - tlb_remove_tlb_entry(tlb, ptep, address); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -3450,15 +3477,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + unsigned long address, pte_t *ptep, + struct page *pagecache_page, spinlock_t *ptl) { + pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: @@ -3654,6 +3683,38 @@ retry: size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; + + /* + * Check for page in userfault range + */ + if (userfaultfd_missing(vma)) { + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = address, + .flags = flags, + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, + idx, address); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + ret = handle_userfault(&vmf, VM_UFFD_MISSING); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + goto out; + } + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); @@ -3711,8 +3772,7 @@ retry: vma_end_reservation(h, vma, address); } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -3733,7 +3793,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3888,8 +3948,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, + pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); @@ -3923,10 +3983,113 @@ out_mutex: return ret; } +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. + */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + int vm_shared = dst_vma->vm_flags & VM_SHARED; + struct hstate *h = hstate_vma(dst_vma); + pte_t _dst_pte; + spinlock_t *ptl; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) + goto out; + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h), false); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + set_page_huge_active(page); + + /* + * If shared, add to page cache + */ + if (vm_shared) { + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + } + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); + spin_lock(ptl); + + ret = -EEXIST; + if (!huge_pte_none(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + if (vm_shared) { + page_dup_rmap(page, true); + } else { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } + + _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + if (vm_shared) + unlock_page(page); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); +out_release_nounlock: + if (vm_shared) + unlock_page(page); + put_page(page); + goto out; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags) + long i, unsigned int flags, int *nonblocking) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -3989,16 +4152,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { int ret; + unsigned int fault_flags = 0; if (pte) spin_unlock(ptl); - ret = hugetlb_fault(mm, vma, vaddr, - (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - if (!(ret & VM_FAULT_ERROR)) - continue; - - remainder = 0; - break; + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & + FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. + */ + return i; + } + continue; } pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; @@ -4027,6 +4217,11 @@ same_page: spin_unlock(ptl); } *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ *position = vaddr; return i ? i : -EFAULT; @@ -4208,7 +4403,9 @@ int hugetlb_reserve_pages(struct inode *inode, return 0; out_err: if (!vma || vma->vm_flags & VM_MAYSHARE) - region_abort(resv_map, from, to); + /* Don't call region_abort if region_chg failed */ + if (chg >= 0) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; @@ -4330,8 +4527,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + ptl = huge_pte_lock(hstate_vma(vma), mm, spte); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); @@ -4361,7 +4557,8 @@ out: int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { pgd_t *pgd = pgd_offset(mm, *addr); - pud_t *pud = pud_offset(pgd, *addr); + p4d_t *p4d = p4d_offset(pgd, *addr); + pud_t *pud = pud_offset(p4d, *addr); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) @@ -4392,11 +4589,13 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; @@ -4416,18 +4615,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, addr); - if (pud_present(*pud)) { - if (pud_huge(*pud)) - return (pte_t *)pud; - pmd = pmd_offset(pud, addr); - } - } + if (!pgd_present(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return NULL; + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); return (pte_t *) pmd; } @@ -4450,6 +4653,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { struct page *page = NULL; spinlock_t *ptl; + pte_t pte; retry: ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); @@ -4459,12 +4663,13 @@ retry: */ if (!pmd_huge(*pmd)) goto out; - if (pmd_present(*pmd)) { + pte = huge_ptep_get((pte_t *)pmd); + if (pte_present(pte)) { page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { - if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); __migration_entry_wait(mm, (pte_t *)pmd, ptl); goto retry; diff --git a/mm/init-mm.c b/mm/init-mm.c index a56a851908d2..975e49f00f34 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -6,6 +6,7 @@ #include <linux/cpumask.h> #include <linux/atomic.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -21,5 +22,6 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..266efaeaa370 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,11 +36,18 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +void page_writeback_init(void); + +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -131,9 +138,9 @@ struct alloc_context { * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) +__find_buddy_pfn(unsigned long page_pfn, unsigned int order) { - return page_idx ^ (1 << order); + return page_pfn ^ (1 << order); } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, @@ -173,6 +180,8 @@ struct compact_control { struct list_head migratepages; /* List of pages being migrated */ unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ @@ -326,12 +335,15 @@ __vma_address(struct page *page, struct vm_area_struct *vma) static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { - unsigned long address = __vma_address(page, vma); + unsigned long start, end; + + start = __vma_address(page, vma); + end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - return address; + return max(start, vma->vm_start); } #else /* !CONFIG_MMU */ @@ -469,6 +481,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, enum ttu_flags; struct tlbflush_unmap_batch; + +/* + * only for MM internal work items which do not depend on + * any allocations or locks which might depend on allocations + */ +extern struct workqueue_struct *mm_percpu_wq; + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 0e9505f66ec1..98b27195e38b 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/printk.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/string.h> @@ -39,6 +40,16 @@ #include "kasan.h" #include "../slab.h" +void kasan_enable_current(void) +{ + current->kasan_depth++; +} + +void kasan_disable_current(void) +{ + current->kasan_depth--; +} + /* * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. @@ -80,7 +91,14 @@ void kasan_unpoison_task_stack(struct task_struct *task) /* Unpoison the stack for the current task beyond a watermark sp value. */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, watermark); + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); } /* @@ -428,7 +446,7 @@ void kasan_cache_shrink(struct kmem_cache *cache) quarantine_remove_cache(cache); } -void kasan_cache_destroy(struct kmem_cache *cache) +void kasan_cache_shutdown(struct kmem_cache *cache) { quarantine_remove_cache(cache); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1c260e6b3b3c..dd2dea8eb077 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_report_enabled(void) -{ - return !current->kasan_depth; -} - void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 3f9a41cf0ac6..b96a5f773d88 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -15,6 +15,7 @@ #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/memblock.h> +#include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> @@ -29,6 +30,9 @@ */ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; +#if CONFIG_PGTABLE_LEVELS > 4 +p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; +#endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; #endif @@ -49,7 +53,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, pte_t *pte = pte_offset_kernel(pmd, addr); pte_t zero_pte; - zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); zero_pte = pte_wrprotect(zero_pte); while (addr + PAGE_SIZE <= end) { @@ -69,7 +73,7 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, next = pmd_addr_end(addr, end); if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } @@ -81,10 +85,10 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, +static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { @@ -92,9 +96,9 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { pmd_t *pmd; - pud_populate(&init_mm, pud, kasan_zero_pmd); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } @@ -106,6 +110,23 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } +static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pud_populate(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + /** * kasan_populate_zero_shadow - populate shadow memory region with * kasan_zero_page @@ -124,6 +145,7 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, next = pgd_addr_end(addr, end); if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -134,12 +156,25 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. + * + * The ifndef is required to avoid build breakage. + * + * With 5level-fixup.h, pgd_populate() is not nop and + * we reference kasan_zero_p4d. It's not defined + * unless 5-level paging enabled. + * + * The ifndef can be dropped once all KASAN-enabled + * architectures will switch to pgtable-nop4d.h. */ - pgd_populate(&init_mm, pgd, kasan_zero_pud); - pud = pud_offset(pgd, addr); - pud_populate(&init_mm, pud, kasan_zero_pmd); +#ifndef __ARCH_HAS_5LEVEL_HACK + pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); +#endif + p4d = p4d_offset(pgd, addr); + p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } @@ -147,6 +182,6 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, pgd_populate(&init_mm, pgd, early_alloc(PAGE_SIZE, NUMA_NO_NODE)); } - zero_pud_populate(pgd, addr, next); + zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index baabaad4a4aa..3a8ddf8baf7d 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -25,6 +25,7 @@ #include <linux/printk.h> #include <linux/shrinker.h> #include <linux/slab.h> +#include <linux/srcu.h> #include <linux/string.h> #include <linux/types.h> @@ -86,24 +87,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) qlist_init(from); } -static void qlist_move(struct qlist_head *from, struct qlist_node *last, - struct qlist_head *to, size_t size) -{ - if (unlikely(last == from->tail)) { - qlist_move_all(from, to); - return; - } - if (qlist_empty(to)) - to->head = from->head; - else - to->tail->next = from->head; - to->tail = last; - from->head = last->next; - last->next = NULL; - from->bytes -= size; - to->bytes += size; -} - +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) /* * The object quarantine consists of per-cpu queues and a global queue, @@ -111,11 +97,23 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last, */ static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); -static struct qlist_head global_quarantine; +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); +DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ -static unsigned long quarantine_size; +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; /* * The fraction of physical memory the quarantine is allowed to occupy. @@ -124,9 +122,6 @@ static unsigned long quarantine_size; */ #define QUARANTINE_FRACTION 32 -#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) -#define QUARANTINE_PERCPU_SIZE (1 << 20) - static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { return virt_to_head_page(qlink)->slab_cache; @@ -180,62 +175,89 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) struct qlist_head *q; struct qlist_head temp = QLIST_INIT; + /* + * Note: irq must be disabled until after we move the batch to the + * global quarantine. Otherwise quarantine_remove_cache() can miss + * some objects belonging to the cache if they are in our local temp + * list. quarantine_remove_cache() executes on_each_cpu() at the + * beginning which ensures that it either sees the objects in per-cpu + * lists or in the global quarantine. + */ local_irq_save(flags); q = this_cpu_ptr(&cpu_quarantine); qlist_put(q, &info->quarantine_link, cache->size); - if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - local_irq_restore(flags); - - if (unlikely(!qlist_empty(&temp))) { - spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_all(&temp, &global_quarantine); - spin_unlock_irqrestore(&quarantine_lock, flags); + spin_lock(&quarantine_lock); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } + spin_unlock(&quarantine_lock); } + + local_irq_restore(flags); } void quarantine_reduce(void) { - size_t new_quarantine_size, percpu_quarantines; + size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; + int srcu_idx; struct qlist_head to_free = QLIST_INIT; - size_t size_to_free = 0; - struct qlist_node *last; - if (likely(READ_ONCE(global_quarantine.bytes) <= - READ_ONCE(quarantine_size))) + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) return; + /* + * srcu critical section ensures that quarantine_remove_cache() + * will not miss objects belonging to the cache while they are in our + * local to_free list. srcu is chosen because (1) it gives us private + * grace period domain that does not interfere with anything else, + * and (2) it allows synchronize_srcu() to return without waiting + * if there are no pending read critical sections (which is the + * expected case). + */ + srcu_idx = srcu_read_lock(&remove_cache_srcu); spin_lock_irqsave(&quarantine_lock, flags); /* * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? - 0 : new_quarantine_size - percpu_quarantines; - WRITE_ONCE(quarantine_size, new_quarantine_size); - - last = global_quarantine.head; - while (last) { - struct kmem_cache *cache = qlink_to_cache(last); - - size_to_free += cache->size; - if (!last->next || size_to_free > - global_quarantine.bytes - QUARANTINE_LOW_SIZE) - break; - last = last->next; + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; } - qlist_move(&global_quarantine, last, &to_free, size_to_free); spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); + srcu_read_unlock(&remove_cache_srcu, srcu_idx); } static void qlist_move_cache(struct qlist_head *from, @@ -273,16 +295,34 @@ static void per_cpu_remove_cache(void *arg) qlist_free_all(&to_free, cache); } +/* Free all quarantined objects belonging to cache. */ void quarantine_remove_cache(struct kmem_cache *cache) { - unsigned long flags; + unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; + /* + * Must be careful to not miss any objects that are being moved from + * per-cpu list to the global quarantine in quarantine_put(), + * nor objects being freed in quarantine_reduce(). on_each_cpu() + * achieves the first goal, while synchronize_srcu() achieves the + * second. + */ on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_cache(&global_quarantine, &to_free, cache); + for (i = 0; i < QUARANTINE_BATCHES; i++) { + if (qlist_empty(&global_quarantine[i])) + continue; + qlist_move_cache(&global_quarantine[i], &to_free, cache); + /* Scanning whole quarantine can take a while. */ + spin_unlock_irqrestore(&quarantine_lock, flags); + cond_resched(); + spin_lock_irqsave(&quarantine_lock, flags); + } spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); + + synchronize_srcu(&remove_cache_srcu); } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 073325aedc68..ab42a0803f16 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,9 @@ * */ +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/printk.h> @@ -136,6 +139,8 @@ static void kasan_end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); kasan_enable_current(); } @@ -290,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_end_report(&flags); } +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +static inline bool kasan_report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { @@ -298,6 +337,8 @@ void kasan_report(unsigned long addr, size_t size, if (likely(!kasan_report_enabled())) return; + disable_trace_on_warning(); + info.access_addr = (void *)addr; info.access_size = size; info.is_write = is_write; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 87e1a7ca3846..ba40b7f673f4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2,6 +2,8 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -420,7 +422,7 @@ int __khugepaged_enter(struct mm_struct *mm) list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); - atomic_inc(&mm->mm_count); + mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); @@ -875,13 +877,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ @@ -889,19 +891,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; - if (!is_swap_pte(pteval)) + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +917,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,7 +945,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* * Before allocating the hugepage, release the mmap_sem read lock. @@ -1309,8 +1311,7 @@ static void collapse_shmem(struct mm_struct *mm, VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | - __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { @@ -1403,6 +1404,9 @@ static void collapse_shmem(struct mm_struct *mm, spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_lookup_slot(&mapping->page_tree, index); + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, + &mapping->tree_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1423,9 +1427,10 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1521,9 +1526,10 @@ tree_unlocked: if (!page || iter.index < page->index) { if (!nr_none) break; - /* Put holes back where they were */ - radix_tree_replace_slot(slot, NULL); nr_none--; + /* Put holes back where they were */ + radix_tree_delete(&mapping->page_tree, + iter.index); continue; } @@ -1532,7 +1538,9 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); @@ -1616,8 +1624,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d1380ed93fdf..20036d4f9f13 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -19,7 +19,7 @@ * * * For more information on the algorithm and kmemleak usage, please see - * Documentation/kmemleak.txt. + * Documentation/dev-tools/kmemleak.rst. * * Notes on locking * ---------------- @@ -73,7 +73,9 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <linux/jiffies.h> #include <linux/delay.h> #include <linux/export.h> @@ -1414,7 +1416,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); + scan_large_block(__start_ro_after_init, __end_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ @@ -19,6 +19,8 @@ #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> @@ -223,6 +225,12 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +/* Checksum of an empty (zeroed) page */ +static unsigned int zero_checksum __read_mostly; + +/* Whether to merge empty (zeroed) pages with actual zero pages */ +static bool ksm_use_zero_pages __read_mostly; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -850,33 +858,36 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr; - pte_t *ptep; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + }; int swapped; int err = -EFAULT; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) + pvmw.address = page_address_in_vma(page, vma); + if (pvmw.address == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; + mmun_start = pvmw.address; + mmun_end = pvmw.address + PAGE_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!page_vma_mapped_walk(&pvmw)) goto out_mn; + if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) + goto out_unlock; - if (pte_write(*ptep) || pte_dirty(*ptep)) { + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) { pte_t entry; swapped = PageSwapCache(page); - flush_cache_page(vma, addr, page_to_pfn(page)); + flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make @@ -886,25 +897,29 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. */ - entry = ptep_clear_flush_notify(vma, addr, ptep); + entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at(mm, addr, ptep, entry); + set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) set_page_dirty(page); - entry = pte_mkclean(pte_wrprotect(entry)); - set_pte_at_notify(mm, addr, ptep, entry); + + if (pte_protnone(entry)) + entry = pte_mkclean(pte_clear_savedwrite(entry)); + else + entry = pte_mkclean(pte_wrprotect(entry)); + set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *ptep; + *orig_pte = *pvmw.pte; err = 0; out_unlock: - pte_unmap_unlock(ptep, ptl); + page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: @@ -926,6 +941,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; pte_t *ptep; + pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; @@ -950,12 +966,22 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out_mn; } - get_page(kpage); - page_add_anon_rmap(kpage, vma, addr, false); + /* + * No need to check ksm_use_zero_pages here: we can only have a + * zero_page here if ksm_use_zero_pages was enabled alreaady. + */ + if (!is_zero_pfn(page_to_pfn(kpage))) { + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr, false); + newpte = mk_pte(kpage, vma->vm_page_prot); + } else { + newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), + vma->vm_page_prot)); + } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); if (!page_mapped(page)) @@ -1467,6 +1493,23 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) return; } + /* + * Same checksum as an empty page. We attempt to merge it with the + * appropriate zero page if the user enabled this via sysfs. + */ + if (ksm_use_zero_pages && (checksum == zero_checksum)) { + struct vm_area_struct *vma; + + vma = find_mergeable_vma(rmap_item->mm, rmap_item->address); + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + /* + * In case of failure, the page was not really empty, so we + * need to continue. Otherwise we're done. + */ + if (!err) + return; + } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { @@ -1813,7 +1856,7 @@ int __ksm_enter(struct mm_struct *mm) spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); - atomic_inc(&mm->mm_count); + mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); @@ -2233,6 +2276,28 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, KSM_ATTR(merge_across_nodes); #endif +static ssize_t use_zero_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_use_zero_pages); +} +static ssize_t use_zero_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return -EINVAL; + + ksm_use_zero_pages = value; + + return count; +} +KSM_ATTR(use_zero_pages); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2290,6 +2355,7 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &use_zero_pages_attr.attr, NULL, }; @@ -2304,6 +2370,11 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; + /* The correct value depends on page size and endianness */ + zero_checksum = calc_checksum(ZERO_PAGE(0)); + /* Default to false for backwards compatibility */ + ksm_use_zero_pages = false; + err = ksm_slab_init(); if (err) goto out; diff --git a/mm/madvise.c b/mm/madvise.c index 93fb63e88b5e..7a2abf0127ae 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,7 @@ #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> +#include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/sched.h> @@ -20,10 +21,13 @@ #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> +#include "internal.h" + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. Others, which simply traverse vmas, need @@ -89,14 +93,28 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; } @@ -117,15 +135,37 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, start, 1); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, end, 0); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } success: @@ -133,10 +173,7 @@ success: * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; - out: - if (error == -ENOMEM) - error = -EAGAIN; return error; } @@ -281,6 +318,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -472,10 +510,47 @@ static long madvise_dontneed(struct vm_area_struct *vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + if (!can_madv_dontneed_vma(vma)) return -EINVAL; - zap_page_range(vma, start, end - start, NULL); + if (!userfaultfd_remove(vma, start, end)) { + *prev = NULL; /* mmap_sem has been dropped, prev is stale */ + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma) + return -ENOMEM; + if (start < vma->vm_start) { + /* + * This "vma" under revalidation is the one + * with the lowest vma->vm_start where start + * is also < vma->vm_end. If start < + * vma->vm_start it means an hole materialized + * in the user address space within the + * virtual range passed to MADV_DONTNEED. + */ + return -ENOMEM; + } + if (!can_madv_dontneed_vma(vma)) + return -EINVAL; + if (end > vma->vm_end) { + /* + * Don't fail if end > vma->vm_end. If the old + * vma was splitted while the mmap_sem was + * released the effect of the concurrent + * operation may not cause MADV_DONTNEED to + * have an undefined result. There may be an + * adjacent next vma that we'll walk + * next. userfaultfd_remove() will generate an + * UFFD_EVENT_REMOVE repetition on the + * end-vma->vm_end range, but the manager can + * handle a repetition fine. + */ + end = vma->vm_end; + } + VM_WARN_ON(start >= end); + } + zap_page_range(vma, start, end - start); return 0; } @@ -515,7 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma, * mmap_sem. */ get_file(f); - up_read(¤t->mm->mmap_sem); + if (userfaultfd_remove(vma, start, end)) { + /* mmap_sem was not released by userfaultfd_remove() */ + up_read(¤t->mm->mmap_sem); + } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..696f06d17c4e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP .physmem.regions = memblock_physmem_init_regions, .physmem.cnt = 1, /* empty dummy entry */ .physmem.max = INIT_PHYSMEM_REGIONS, + .physmem.name = "physmem", #endif .bottom_up = false, @@ -64,18 +67,6 @@ ulong __init_memblock choose_memblock_flags(void) return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } -/* inline so we don't get a warning when pr_debug is compiled out */ -static __init_memblock const char * -memblock_type_name(struct memblock_type *type) -{ - if (type == &memblock.memory) - return "memory"; - else if (type == &memblock.reserved) - return "reserved"; - else - return "unknown"; -} - /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { @@ -402,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", - memblock_type_name(type), type->max, type->max * 2); + type->name, type->max, type->max * 2); return -1; } memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, + type->name, type->max * 2, (u64)addr, (u64)addr + new_size - 1); /* @@ -611,10 +602,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -718,10 +709,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { - memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); @@ -729,10 +720,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", - (unsigned long long)base, - (unsigned long long)base + size - 1, - 0UL, (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } @@ -1105,6 +1096,34 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } +unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, + unsigned long max_pfn) +{ + struct memblock_type *type = &memblock.memory; + unsigned int right = type->cnt; + unsigned int mid, left = 0; + phys_addr_t addr = PFN_PHYS(pfn + 1); + + do { + mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else { + /* addr is within the region, so pfn + 1 is valid */ + return min(pfn + 1, max_pfn); + } + } while (left < right); + + if (right == type->cnt) + return max_pfn; + else + return min(PHYS_PFN(type->regions[right].base), max_pfn); +} + /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for @@ -1202,8 +1221,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys alloc = __memblock_alloc_base(size, align, max_addr); if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + panic("ERROR: Failed to allocate %pa bytes below %pa.\n", + &size, &max_addr); return alloc; } @@ -1274,18 +1293,17 @@ static void * __init memblock_virt_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, nid, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, NUMA_NO_NODE, flags); - if (alloc) + if (alloc && !memblock_reserve(alloc, size)) goto done; } @@ -1303,7 +1321,6 @@ again: return NULL; done: - memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); memset(ptr, 0, size); @@ -1615,8 +1632,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size if (idx == -1) return 0; - return memblock.memory.regions[idx].base <= base && - (memblock.memory.regions[idx].base + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size) >= end; } @@ -1671,40 +1687,44 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) return memblock.current_limit; } -static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type) { - unsigned long long base, size; + phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; - pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; base = rgn->base; size = rgn->size; + end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, idx, base, base + size - 1, size, nid_buf, flags); + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + type->name, idx, &base, &end, &size, nid_buf, flags); } } void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = %#llx reserved size = %#llx\n", - (unsigned long long)memblock.memory.total_size, - (unsigned long long)memblock.reserved.total_size); + pr_info(" memory size = %pa reserved size = %pa\n", + &memblock.memory.total_size, + &memblock.reserved.total_size); - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + memblock_dump(&memblock.memory); + memblock_dump(&memblock.reserved); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + memblock_dump(&memblock.physmem); +#endif } void __init memblock_allow_resize(void) @@ -1727,19 +1747,14 @@ static int memblock_debug_show(struct seq_file *m, void *private) struct memblock_type *type = m->private; struct memblock_region *reg; int i; + phys_addr_t end; for (i = 0; i < type->cnt; i++) { reg = &type->regions[i]; - seq_printf(m, "%4d: ", i); - if (sizeof(phys_addr_t) == 4) - seq_printf(m, "0x%08lx..0x%08lx\n", - (unsigned long)reg->base, - (unsigned long)(reg->base + reg->size - 1)); - else - seq_printf(m, "0x%016llx..0x%016llx\n", - (unsigned long long)reg->base, - (unsigned long long)(reg->base + reg->size - 1)); + end = reg->base + reg->size - 1; + seq_printf(m, "%4d: ", i); + seq_printf(m, "%pa..%pa\n", ®->base, &end); } return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f870ba43942..2bd7541d7c11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,6 +35,8 @@ #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/shmem_fs.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/smp.h> @@ -68,7 +70,7 @@ #include <net/ip.h> #include "slab.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <trace/events/vmscan.h> @@ -317,6 +319,8 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); +struct workqueue_struct *memcg_kmem_cache_wq; + #endif /* !CONFIG_SLOB */ /** @@ -462,6 +466,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); + if (!mctz) + return; /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. @@ -499,7 +505,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); mctz = soft_limit_tree_node(nid); - mem_cgroup_remove_exceeded(mz, mctz); + if (mctz) + mem_cgroup_remove_exceeded(mz, mctz); } } @@ -625,8 +632,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +641,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1008,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,27 +1016,25 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; - bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; - empty = list_empty(lruvec->lists + lru); + lru_size = &mz->lru_zone_size[zid][lru]; if (nr_pages < 0) *lru_size += nr_pages; size = *lru_size; - if (WARN_ONCE(size < 0 || empty != !size, - "%s(%p, %d, %d): lru_size %ld but %sempty\n", - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } @@ -1816,22 +1821,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static int memcg_cpu_hotplug_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +static int memcg_hotplug_cpu_dead(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - if (action == CPU_ONLINE) - return NOTIFY_OK; - - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - return NOTIFY_OK; + return 0; } static void reclaim_high(struct mem_cgroup *memcg, @@ -2185,7 +2181,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - schedule_work(&cw->work); + queue_work(memcg_kmem_cache_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -2565,7 +2561,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * is empty. Do it lockless to prevent lock bouncing. Races * are acceptable as soft limit is best effort anyway. */ - if (RB_EMPTY_ROOT(&mctz->rb_root)) + if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) return 0; /* @@ -2846,6 +2842,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; + INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -4011,9 +4008,9 @@ static struct cftype mem_cgroup_legacy_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_start = slab_start, - .seq_next = slab_next, - .seq_stop = slab_stop, + .seq_start = memcg_slab_start, + .seq_next = memcg_slab_next, + .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif @@ -4141,17 +4138,22 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static void mem_cgroup_free(struct mem_cgroup *memcg) +static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); kfree(memcg); } +static void mem_cgroup_free(struct mem_cgroup *memcg) +{ + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); +} + static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; @@ -4202,7 +4204,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) fail: if (memcg->id.id > 0) idr_remove(&mem_cgroup_idr, memcg->id.id); - mem_cgroup_free(memcg); + __mem_cgroup_free(memcg); return NULL; } @@ -4362,9 +4364,9 @@ static int mem_cgroup_do_precharge(unsigned long count) return ret; } - /* Try charges one by one with reclaim */ + /* Try charges one by one with reclaim, but do not retry */ while (count--) { - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); if (ret) return ret; mc.precharge++; @@ -5774,16 +5776,28 @@ __setup("cgroup.memory=", cgroup_memory); /* * subsys_initcall() for memory controller. * - * Some parts like hotcpu_notifier() have to be initialized from this context - * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically - * everything that doesn't depend on a specific mem_cgroup structure should - * be initialized from here. + * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this + * context because of lock dependencies (cgroup_lock -> cpu hotplug) but + * basically everything that doesn't depend on a specific mem_cgroup structure + * should be initialized from here. */ static int __init mem_cgroup_init(void) { int cpu, node; - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); +#ifndef CONFIG_SLOB + /* + * Kmem cache creation is mostly done with the slab_mutex held, + * so use a workqueue with limited concurrency to avoid stalling + * all worker threads in case lots of cgroups are created and + * destroyed simultaneously. + */ + memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); + BUG_ON(!memcg_kmem_cache_wq); +#endif + + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, + memcg_hotplug_cpu_dead); for_each_possible_cpu(cpu) INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19e796d36a62..27f7210e7fab 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -40,7 +40,8 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/kernel-page-flags.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> @@ -764,12 +765,11 @@ static int me_huge_page(struct page *p, unsigned long pfn) */ #define dirty (1UL << PG_dirty) -#define sc (1UL << PG_swapcache) +#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) #define unevict (1UL << PG_unevictable) #define mlock (1UL << PG_mlocked) #define writeback (1UL << PG_writeback) #define lru (1UL << PG_lru) -#define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -819,7 +819,6 @@ static struct page_state { #undef mlock #undef writeback #undef lru -#undef swapbacked #undef head #undef slab #undef reserved @@ -1529,7 +1528,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) { int ret = __get_any_page(page, pfn, flags); - if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + if (ret == 1 && !PageHuge(page) && + !PageLRU(page) && !__PageMovable(page)) { /* * Try to free it. */ @@ -1651,7 +1651,10 @@ static int __soft_offline_page(struct page *page, int flags) * Try to migrate to a new page instead. migrate.c * handles a large number of cases for us. */ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); /* * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. @@ -1659,18 +1662,20 @@ static int __soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + /* + * After isolated lru page, the PageLRU will be cleared, + * so use !__PageMovable instead for LRU page's mapping + * cannot have PAGE_MAPPING_MOVABLE. + */ + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - if (!list_empty(&pagelist)) { - list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - putback_lru_page(page); - } + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory.c b/mm/memory.c index e18c57bdc75c..235ba51b2fbf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -30,7 +30,7 @@ /* * 05.04.94 - Multi-page memory management added for v1.1. - * Idea by Alex Bligh (alex@cconcepts.co.uk) + * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) @@ -40,6 +40,10 @@ #include <linux/kernel_stat.h> #include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> @@ -68,7 +72,7 @@ #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgtable.h> @@ -82,9 +86,9 @@ #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; -struct page *mem_map; - EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif @@ -95,8 +99,7 @@ EXPORT_SYMBOL(mem_map); * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL * and ZONE_HIGHMEM. */ -void * high_memory; - +void *high_memory; EXPORT_SYMBOL(high_memory); /* @@ -120,10 +123,10 @@ static int __init disable_randmaps(char *s) __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; - EXPORT_SYMBOL(zero_pfn); +unsigned long highest_memmap_pfn __read_mostly; + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -300,15 +303,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); - - if (!tlb->page_size) - tlb->page_size = page_size; - else { - if (page_size != tlb->page_size) - return true; - } + VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; @@ -316,7 +318,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - batch->pages[batch->nr++] = page; return false; } @@ -444,7 +445,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } -static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -453,7 +454,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start; start = addr; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -461,6 +462,39 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); + start &= P4D_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= P4D_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(p4d, start); + p4d_clear(p4d); + pud_free_tlb(tlb, pud, start); +} + +static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + p4d_t *p4d; + unsigned long next; + unsigned long start; + + start = addr; + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + free_pud_range(tlb, p4d, addr, next, floor, ceiling); + } while (p4d++, addr = next, addr != end); + start &= PGDIR_MASK; if (start < floor) return; @@ -472,9 +506,9 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - pud = pud_offset(pgd, start); + p4d = p4d_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud, start); + p4d_free_tlb(tlb, p4d, start); } /* @@ -528,13 +562,17 @@ void free_pgd_range(struct mmu_gather *tlb, end -= PMD_SIZE; if (addr > end - 1) return; - + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(tlb, pgd, addr, next, floor, ceiling); + free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } @@ -554,7 +592,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); + floor, next ? next->vm_start : ceiling); } else { /* * Optimization: gather nearby vmas into one call down @@ -567,7 +605,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, - floor, next? next->vm_start: ceiling); + floor, next ? next->vm_start : ceiling); } vma = next; } @@ -653,7 +691,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; @@ -999,7 +1038,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1018,18 +1057,30 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; unsigned long next; - dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; - src_pud = pud_offset(src_pgd, addr); + src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, @@ -1039,6 +1090,28 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src return 0; } +static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + p4d_t *src_p4d, *dst_p4d; + unsigned long next; + + dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); + if (!dst_p4d) + return -ENOMEM; + src_p4d = p4d_offset(src_pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(src_p4d)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, + vma, addr, next)) + return -ENOMEM; + } while (dst_p4d++, src_p4d++, addr = next, addr != end); + return 0; +} + int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *vma) { @@ -1094,7 +1167,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, vma, addr, next))) { ret = -ENOMEM; break; @@ -1118,8 +1191,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; - struct page *pending_page = NULL; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1127,9 +1200,8 @@ again: arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; - if (pte_none(ptent)) { + if (pte_none(ptent)) continue; - } if (pte_present(ptent)) { struct page *page; @@ -1153,12 +1225,6 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { - /* - * oom_reaper cannot tear down dirty - * pages - */ - if (unlikely(details && details->ignore_dirty)) - continue; force_flush = 1; set_page_dirty(page); } @@ -1172,14 +1238,13 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; - pending_page = page; addr += PAGE_SIZE; break; } continue; } - /* only check swap_entries if explicitly asked for in details */ - if (unlikely(details && !details->check_swap_entries)) + /* If details->check_mapping, we leave swap entries. */ + if (unlikely(details)) continue; entry = pte_to_swp_entry(ptent); @@ -1213,11 +1278,6 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - if (pending_page) { - /* remove the page with new size */ - __tlb_remove_pte_page(tlb, pending_page); - pending_page = NULL; - } if (addr != end) goto again; } @@ -1240,7 +1300,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1263,24 +1323,53 @@ next: } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pgd_t *pgd, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); } while (pud++, addr = next, addr != end); return addr; } +static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + next = zap_pud_range(tlb, vma, p4d, addr, next, details); + } while (p4d++, addr = next, addr != end); + + return addr; +} + void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -1296,7 +1385,7 @@ void unmap_page_range(struct mmu_gather *tlb, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - next = zap_pud_range(tlb, vma, pgd, addr, next, details); + next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -1380,12 +1469,11 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; @@ -1396,7 +1484,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, details); + unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1452,16 +1540,24 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { - pgd_t * pgd = pgd_offset(mm, addr); - pud_t * pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) { - VM_BUG_ON(pmd_trans_huge(*pmd)); - return pte_alloc_map_lock(mm, pmd, addr, ptl); - } - } - return NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + VM_BUG_ON(pmd_trans_huge(*pmd)); + return pte_alloc_map_lock(mm, pmd, addr, ptl); } /* @@ -1637,8 +1733,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); @@ -1655,8 +1751,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) - return -EINVAL; + + track_pfn_insert(vma, &pgprot, pfn); /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1727,7 +1823,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, return 0; } -static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -1735,7 +1831,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long next; pfn -= addr >> PAGE_SHIFT; - pud = pud_alloc(mm, pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -1747,6 +1843,26 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + p4d_t *p4d; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + if (remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + return 0; +} + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -1803,7 +1919,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); - err = remap_pud_range(mm, pgd, addr, next, + err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) break; @@ -1919,7 +2035,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, return err; } -static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, +static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { @@ -1927,7 +2043,7 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long next; int err; - pud = pud_alloc(mm, pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -1939,6 +2055,26 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, return err; } +static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + p4d_t *p4d; + unsigned long next; + int err; + + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (err) + break; + } while (p4d++, addr = next, addr != end); + return err; +} + /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. @@ -1957,7 +2093,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); if (err) break; } while (pgd++, addr = next, addr != end); @@ -2038,20 +2174,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2067,6 +2200,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2074,11 +2242,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2088,39 +2256,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(page); - - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); - put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return VM_FAULT_WRITE; + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } /* @@ -2139,31 +2280,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2176,8 +2318,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2187,7 +2329,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2196,8 +2338,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2205,8 +2347,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2243,7 +2385,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2267,79 +2409,91 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return VM_FAULT_NOPAGE; + } + wp_page_reuse(vmf); + return 0; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, fe->address), - .virtual_address = - (void __user *)(fe->address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) - __releases(fe->ptl) +static int wp_page_shared(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - int page_mkwrite = 0; + struct vm_area_struct *vma = vmf->vma; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); - return 0; + tmp = finish_mkwrite_fault(vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + unlock_page(vmf->page); + put_page(vmf->page); + return tmp; } - page_mkwrite = 1; + } else { + wp_page_reuse(vmf); + lock_page(vmf->page); } + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return VM_FAULT_WRITE; } /* @@ -2360,14 +2514,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - struct page *old_page; + struct vm_area_struct *vma = vmf->vma; - old_page = vm_normal_page(vma, fe->address, orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2377,33 +2530,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + lock_page(vmf->page); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + unlock_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. Move it to @@ -2412,24 +2565,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + unlock_page(vmf->page); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ - get_page(old_page); + get_page(vmf->page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2496,7 +2650,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows? NULL: mapping; + details.check_mapping = even_cows ? NULL : mapping; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2517,9 +2671,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2528,17 +2682,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2546,16 +2701,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2577,7 +2732,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2594,7 +2749,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2610,9 +2765,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2633,22 +2788,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2671,22 +2827,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2737,9 +2893,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2749,7 +2905,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2762,26 +2918,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2789,7 +2945,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2807,30 +2963,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2847,62 +3003,50 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + ret = vma->vm_ops->fault(vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) return ret; - } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf->page); + put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = NULL; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2917,11 +3061,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2939,11 +3083,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static void deposit_prealloc_pte(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + vmf->prealloc_pte = NULL; +} + +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2953,8 +3110,19 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. + */ + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2966,20 +3134,25 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -2990,41 +3163,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) return ret; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) + if (unlikely(!pte_none(*vmf->pte))) return VM_FAULT_NOPAGE; flush_icache_page(vma, page); @@ -3034,21 +3208,53 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); return 0; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3113,17 +3319,18 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3131,50 +3338,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; + struct vm_area_struct *vma = vmf->vma; int ret = 0; /* @@ -3183,80 +3385,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - unlock_page(fault_page); + ret |= finish_fault(vmf); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page, *new_page; - void *fault_entry; - struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - put_page(new_page); + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + &vmf->memcg, false)) { + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); - } + ret |= finish_fault(vmf); + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + put_page(vmf->cow_page); return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; - struct address_space *mapping; - int dirtied = 0; + struct vm_area_struct *vma = vmf->vma; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3265,46 +3454,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } @@ -3314,19 +3481,27 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; + int ret; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) - return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + ret = VM_FAULT_SIGBUS; + else if (!(vmf->flags & FAULT_FLAG_WRITE)) + ret = do_read_fault(vmf); + else if (!(vma->vm_flags & VM_SHARED)) + ret = do_cow_fault(vmf); + else + ret = do_shared_fault(vmf); + + /* preallocated pagetable is unused: free it */ + if (vmf->prealloc_pte) { + pte_free(vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + return ret; } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3344,50 +3519,51 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; - bool was_writable = pte_write(pte); + pte_t pte; + bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. - * - * We can safely just do a "set_pte_at()", because the old - * page table entry is not accessible, so there would be no - * concurrent hardware modifications to the PTE. - */ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + */ + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } - /* Make it present again */ + /* + * Make it present again, Depending on how arch implementes non + * accessible ptes, some can allow access by kernel mode. + */ + pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3411,9 +3587,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3433,28 +3609,25 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_anonymous_page(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - split_huge_pmd(fe->vma, fe->pmd, fe->address); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3464,6 +3637,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } +static int create_huge_pud(struct vm_fault *vmf) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3479,21 +3676,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3501,9 +3698,8 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); - - entry = *fe->pte; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3514,38 +3710,39 @@ static int handle_pte_fault(struct fault_env *fe) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + if (pte_none(vmf->orig_pte)) { + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(fe, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3553,11 +3750,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3570,48 +3767,78 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; + int ret; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + + vmf.pud = pud_alloc(mm, p4d, address); + if (!vmf.pud) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; - int ret; + pud_t orig_pud = *vmf.pud; + + barrier(); + if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + /* NUMA case for anonymous PUDs would go here */ + + if (dirty && !pud_write(orig_pud)) { + ret = wp_huge_pud(&vmf, orig_pud); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pud_set_accessed(&vmf, orig_pud); + return 0; + } + } + } + + vmf.pmd = pmd_alloc(mm, vmf.pud, address); + if (!vmf.pmd) + return VM_FAULT_OOM; + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + ret = create_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pmd_t orig_pmd = *vmf.pmd; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* @@ -3652,14 +3879,14 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) { mem_cgroup_oom_disable(); - /* - * The task may have entered a memcg OOM situation but - * if the allocation error was handled gracefully (no - * VM_FAULT_OOM), there is no need to kill anything. - * Just clean up the OOM state peacefully. - */ - if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) - mem_cgroup_oom_synchronize(false); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); } /* @@ -3679,12 +3906,35 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(handle_mm_fault); +#ifndef __PAGETABLE_P4D_FOLDED +/* + * Allocate p4d page table. + * We've already handled the fast-path in-line. + */ +int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + p4d_free(mm, new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_P4D_FOLDED */ + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) @@ -3693,10 +3943,17 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ +#ifndef __ARCH_HAS_5LEVEL_HACK + if (p4d_present(*p4d)) /* Another has populated it */ pud_free(mm, new); else - pgd_populate(mm, pgd, new); + p4d_populate(mm, p4d, new); +#else + if (pgd_present(*p4d)) /* Another has populated it */ + pud_free(mm, new); + else + pgd_populate(mm, p4d, new); +#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } @@ -3709,13 +3966,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); + ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); @@ -3729,15 +3987,16 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; @@ -3746,17 +4005,30 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) goto out; pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (pmd_huge(*pmd)) { + if (!pmdpp) + goto out; + + *ptlp = pmd_lock(mm, pmd); + if (pmd_huge(*pmd)) { + *pmdpp = pmd; + return 0; + } + spin_unlock(*ptlp); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); @@ -3779,10 +4051,24 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte(mm, address, ptepp, ptlp))); + !(res = __follow_pte_pmd(mm, address, ptepp, NULL, + ptlp))); return res; } +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, + ptlp))); + return res; +} +EXPORT_SYMBOL(follow_pte_pmd); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping @@ -3868,7 +4154,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys); * Access another process' address space as given in mm. If non-NULL, use the * given task for page fault accounting. */ -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; @@ -3883,7 +4169,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; @@ -3966,6 +4252,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, return ret; } +EXPORT_SYMBOL_GPL(access_process_vm); /* * Print the name of a VMA. @@ -4093,6 +4380,38 @@ void copy_user_huge_page(struct page *dst, struct page *src, copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } + +long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page, + bool allow_pagefault) +{ + void *src = (void *)usr_src; + void *page_kaddr; + unsigned long i, rc = 0; + unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + + for (i = 0; i < pages_per_huge_page; i++) { + if (allow_pagefault) + page_kaddr = kmap(dst_page + i); + else + page_kaddr = kmap_atomic(dst_page + i); + rc = copy_from_user(page_kaddr, + (const void __user *)(src + i * PAGE_SIZE), + PAGE_SIZE); + if (allow_pagefault) + kunmap(dst_page + i); + else + kunmap_atomic(page_kaddr); + + ret_val -= (PAGE_SIZE - rc); + if (rc) + break; + + cond_resched(); + } + return ret_val; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cad4b9125695..6fa7208bcd56 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -6,6 +6,7 @@ #include <linux/stddef.h> #include <linux/mm.h> +#include <linux/sched/signal.h> #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> @@ -124,8 +125,13 @@ void put_online_mems(void) } +/* Serializes write accesses to mem_hotplug.active_writer. */ +static DEFINE_MUTEX(memory_add_remove_lock); + void mem_hotplug_begin(void) { + mutex_lock(&memory_add_remove_lock); + mem_hotplug.active_writer = current; memhp_lock_acquire(); @@ -144,6 +150,7 @@ void mem_hotplug_done(void) mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); memhp_lock_release(); + mutex_unlock(&memory_add_remove_lock); } /* add this memory to iomem resource */ @@ -179,7 +186,7 @@ static void release_memory_resource(struct resource *res) void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { - page->lru.next = (struct list_head *) type; + page->freelist = (void *)type; SetPagePrivate(page); set_page_private(page, info); page_ref_inc(page); @@ -189,11 +196,12 @@ void put_page_bootmem(struct page *page) { unsigned long type; - type = (unsigned long) page->lru.next; + type = (unsigned long) page->freelist; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { + page->freelist = NULL; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); @@ -861,7 +869,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, return ret; } -EXPORT_SYMBOL_GPL(__remove_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) @@ -1033,36 +1040,39 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -int zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target) +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target, int *zone_shift) { struct zone *zone = page_zone(pfn_to_page(pfn)); enum zone_type idx = zone_idx(zone); int i; + *zone_shift = 0; + if (idx < target) { /* pages must be at end of current zone */ if (pfn + nr_pages != zone_end_pfn(zone)) - return 0; + return false; /* no zones in use between current zone and target */ for (i = idx + 1; i < target; i++) if (zone_is_initialized(zone - idx + i)) - return 0; + return false; } if (target < idx) { /* pages must be at beginning of current zone */ if (pfn != zone->zone_start_pfn) - return 0; + return false; /* no zones in use between current zone and target */ for (i = target + 1; i < idx; i++) if (zone_is_initialized(zone - idx + i)) - return 0; + return false; } - return target - idx; + *zone_shift = target - idx; + return true; } /* Must be protected by mem_hotplug_begin() */ @@ -1089,10 +1099,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ !can_online_high_movable(zone)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) - zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL); - else if (online_type == MMOP_ONLINE_MOVABLE) - zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE); + if (online_type == MMOP_ONLINE_KERNEL) { + if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) + return -EINVAL; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) + return -EINVAL; + } zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); if (!zone) @@ -1329,7 +1342,7 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, static int online_memory_block(struct memory_block *mem, void *arg) { - return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + return device_online(&mem->dev); } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ @@ -1477,17 +1490,20 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) is belongs to the same zone. + * Confirm all pages in a range [start, end) belong to the same zone. + * When true, return its valid [start, end). */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end) { unsigned long pfn, sec_end_pfn; + unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; - pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { /* Make sure the memory section is present first */ if (!present_section_nr(pfn_to_section_nr(pfn))) continue; @@ -1498,22 +1514,32 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) i++; - if (i == MAX_ORDER_NR_PAGES) + if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; + if (!zone) + start = pfn + i; zone = page_zone(page); + end = pfn + MAX_ORDER_NR_PAGES; } } - return 1; + + if (zone) { + *valid_start = start; + *valid_end = min(end, end_pfn); + return 1; + } else { + return 0; + } } /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages - * and hugepages). We scan pfn because it's much easier than scanning over - * linked list. This function returns the pfn of the first found movable - * page if it's found, otherwise 0. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, + * non-lru movable pages and hugepages). We scan pfn because it's much + * easier than scanning over linked list. This function returns the pfn + * of the first found movable page if it's found, otherwise 0. */ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { @@ -1524,6 +1550,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (__PageMovable(page)) + return pfn; if (PageHuge(page)) { if (page_huge_active(page)) return pfn; @@ -1600,21 +1628,25 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!get_page_unless_zero(page)) continue; /* - * We can skip free pages. And we can only deal with pages on - * LRU. + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. */ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } else { #ifdef CONFIG_DEBUG_VM - pr_alert("removing pfn %lx from LRU failed\n", pfn); - dump_page(page, "failed to remove from LRU"); + pr_alert("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); #endif put_page(page); /* Because we don't have big zone->lock. we should @@ -1727,26 +1759,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); @@ -1853,6 +1865,7 @@ static int __ref __offline_pages(unsigned long start_pfn, long offlined_pages; int ret, drain, retry_max, node; unsigned long flags; + unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; @@ -1863,10 +1876,10 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return -EINVAL; - zone = page_zone(pfn_to_page(start_pfn)); + zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b859af06b87..37d0b334bfe9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -73,6 +73,9 @@ #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/numa_balancing.h> +#include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> @@ -96,7 +99,7 @@ #include <linux/printk.h> #include <asm/tlbflush.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -276,7 +279,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) @@ -496,7 +501,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { get_page(page); spin_unlock(ptl); @@ -1524,7 +1529,6 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode) { - long err = 0; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); @@ -1533,14 +1537,13 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { - err = compat_get_bitmap(bm, nmask, nr_bits); + if (compat_get_bitmap(bm, nmask, nr_bits)) + return -EFAULT; nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, bm, alloc_size); + if (copy_to_user(nm, bm, alloc_size)) + return -EFAULT; } - if (err) - return -EFAULT; - return sys_set_mempolicy(mode, nm, nr_bits+1); } @@ -1548,7 +1551,6 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, compat_ulong_t, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, flags) { - long err = 0; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; nodemask_t bm; @@ -1557,14 +1559,13 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { - err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); + if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) + return -EFAULT; nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, nodes_addr(bm), alloc_size); + if (copy_to_user(nm, nodes_addr(bm), alloc_size)) + return -EFAULT; } - if (err) - return -EFAULT; - return sys_mbind(start, len, mode, nm, nr_bits+1, flags); } @@ -1679,25 +1680,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, int nd) { - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: + if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + else { /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + return node_zonelist(nd, gfp); } @@ -2023,8 +2016,8 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); zl = policy_zonelist(gfp, pol, node); - mpol_cond_put(pol); page = __alloc_pages_nodemask(gfp, order, zl, nmask); + mpol_cond_put(pol); out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; diff --git a/mm/migrate.c b/mm/migrate.c index 99250aee1ac1..738f1d5f8350 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -40,6 +40,7 @@ #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/page_owner.h> +#include <linux/sched/mm.h> #include <asm/tlbflush.h> @@ -74,7 +75,7 @@ int migrate_prep_local(void) return 0; } -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -125,14 +126,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) __SetPageIsolated(page); unlock_page(page); - return true; + return 0; out_no_isolated: unlock_page(page); out_putpage: put_page(page); out: - return false; + return -EBUSY; } /* It should be called on page which is PG_movable */ @@ -168,8 +169,6 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use * __PageMovable because LRU page's mapping cannot have @@ -185,6 +184,8 @@ void putback_movable_pages(struct list_head *l) unlock_page(page); put_page(page); } else { + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); } } @@ -193,82 +194,65 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, +static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - struct mm_struct *mm = vma->vm_mm; + struct page_vma_mapped_walk pvmw = { + .page = old, + .vma = vma, + .address = addr, + .flags = PVMW_SYNC | PVMW_MIGRATION, + }; + struct page *new; + pte_t pte; swp_entry_t entry; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - if (unlikely(PageHuge(new))) { - ptep = huge_pte_offset(mm, addr); - if (!ptep) - goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); - } else { - pmd = mm_find_pmd(mm, addr); - if (!pmd) - goto out; + VM_BUG_ON_PAGE(PageTail(page), page); + while (page_vma_mapped_walk(&pvmw)) { + if (PageKsm(page)) + new = page; + else + new = page - pvmw.page->index + + linear_page_index(vma, pvmw.address); - ptep = pte_offset_map(pmd, addr); + get_page(new); + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_mksoft_dirty(pte); /* - * Peek to check is_swap_pte() before taking ptlock? No, we - * can race mremap's move_ptes(), which skips anon_vma lock. + * Recheck VMA as permissions can change since migration started */ - - ptl = pte_lockptr(mm, pmd); - } - - spin_lock(ptl); - pte = *ptep; - if (!is_swap_pte(pte)) - goto unlock; - - entry = pte_to_swp_entry(pte); - - if (!is_migration_entry(entry) || - migration_entry_to_page(entry) != old) - goto unlock; - - get_page(new); - pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) - pte = pte_mksoft_dirty(pte); - - /* Recheck VMA as permissions can change since migration started */ - if (is_write_migration_entry(entry)) - pte = maybe_mkwrite(pte, vma); + entry = pte_to_swp_entry(*pvmw.pte); + if (is_write_migration_entry(entry)) + pte = maybe_mkwrite(pte, vma); #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) { - pte = pte_mkhuge(pte); - pte = arch_make_huge_pte(pte, vma, new, 0); - } + if (PageHuge(new)) { + pte = pte_mkhuge(pte); + pte = arch_make_huge_pte(pte, vma, new, 0); + } #endif - flush_dcache_page(new); - set_pte_at(mm, addr, ptep, pte); + flush_dcache_page(new); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageHuge(new)) { - if (PageAnon(new)) - hugepage_add_anon_rmap(new, vma, addr); + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, pvmw.address); + else + page_dup_rmap(new, true); + } else if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); else - page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr, false); - else - page_add_file_rmap(new, false); + page_add_file_rmap(new, false); - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) + mlock_vma_page(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, pvmw.address, pvmw.pte); + } - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, ptep); -unlock: - pte_unmap_unlock(ptep, ptl); -out: return SWAP_AGAIN; } @@ -466,13 +450,15 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - __SetPageSwapBacked(newpage); - get_page(newpage); /* add cache reference */ - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); + if (PageSwapBacked(page)) { + __SetPageSwapBacked(newpage); + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + } else { + VM_BUG_ON_PAGE(PageSwapCache(page), page); } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -482,7 +468,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +542,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); @@ -1121,8 +1107,15 @@ out: * restored. */ list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__PageMovable(page))) + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } /* diff --git a/mm/mincore.c b/mm/mincore.c index bfb866435478..c5687c45c326 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,9 +14,10 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/shmem_fs.h> #include <linux/hugetlb.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, diff --git a/mm/mlock.c b/mm/mlock.c index cdbed8aaa426..0dd9ca18e19e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> @@ -379,6 +380,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, pte = get_locked_pte(vma->vm_mm, start, &ptl); /* Make sure we do not cross the page table boundary */ end = pgd_addr_end(start, end); + end = p4d_addr_end(start, end); end = pud_addr_end(start, end); end = pmd_addr_end(start, end); @@ -441,7 +443,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, while (start < end) { struct page *page; - unsigned int page_mask; + unsigned int page_mask = 0; unsigned long page_increm; struct pagevec pvec; struct zone *zone; @@ -455,8 +457,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * suits munlock very well (and if somehow an abnormal page * has sneaked into the range, we won't oops here: great). */ - page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, - &page_mask); + page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { if (PageTransTail(page)) { @@ -467,8 +468,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, /* * Any THP page found by follow_page_mask() may * have gotten split before reaching - * munlock_vma_page(), so we need to recompute - * the page_mask here. + * munlock_vma_page(), so we need to compute + * the page_mask here instead. */ page_mask = munlock_vma_page(page); unlock_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 1af87c14183d..bfbe8856d134 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,7 +45,7 @@ #include <linux/moduleparam.h> #include <linux/pkeys.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> @@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; unsigned long min_brk; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) goto set_brk; goto out; } @@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) goto out; set_brk: mm->brk = brk; populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; @@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm, unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -1668,7 +1672,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * new file must not have been exposed to user-space, yet. */ vma->vm_file = get_file(file); - error = file->f_op->mmap(file, vma); + error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -2495,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. We use this on the - * munmap path where it doesn't make sense to fail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. */ -static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; int err; @@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge <jeremy@goop.org> */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (vma->vm_start >= end) return 0; + if (uf) { + int error = userfaultfd_unmap_prep(vma, start, end, uf); + + if (error) + return error; + } + /* * If we need to split any vma, do it now to save pain later. * @@ -2668,27 +2680,22 @@ int vm_munmap(unsigned long start, size_t len) { int ret; struct mm_struct *mm = current->mm; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len); + ret = do_munmap(mm, start, len, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { - int ret; - struct mm_struct *mm = current->mm; - profile_munmap(addr); - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; + return vm_munmap(addr, len); } @@ -2780,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate); + prot, flags, pgoff, &populate, NULL); fput(file); out: up_write(&mm->mmap_sem); @@ -2806,11 +2813,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk(unsigned long addr, unsigned long request) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags, len; + unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2821,7 +2828,10 @@ static int do_brk(unsigned long addr, unsigned long request) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (offset_in_page(error)) @@ -2842,7 +2852,7 @@ static int do_brk(unsigned long addr, unsigned long request) */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2889,22 +2899,35 @@ out: return 0; } -int vm_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) +{ + return do_brk_flags(addr, len, 0, uf); +} + +int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; int ret; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk(addr, len); + ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; } +EXPORT_SYMBOL(vm_brk_flags); + +int vm_brk(unsigned long addr, unsigned long len) +{ + return vm_brk_flags(addr, len, 0); +} EXPORT_SYMBOL(vm_brk); /* Release all mmaps. */ @@ -3111,8 +3134,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf); +static int special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3147,9 +3169,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int special_mapping_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; @@ -3159,7 +3181,7 @@ static int special_mapping_fault(struct vm_area_struct *vma, struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) - return sm->fault(sm, vma, vmf); + return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; } @@ -3433,7 +3455,7 @@ void mm_drop_all_locks(struct mm_struct *mm) } /* - * initialise the VMA slab + * initialise the percpu counter for VM */ void __init mmap_init(void) { diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 6f4d27c5bb32..3e612ae748e9 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -5,6 +5,8 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/mmu_context.h> #include <linux/export.h> @@ -25,7 +27,7 @@ void use_mm(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; if (active_mm != mm) { - atomic_inc(&mm->mm_count); + mmgrab(mm); tsk->active_mm = mm; } tsk->mm = mm; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f4259e496f83..a7652acd2ab9 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -17,6 +17,7 @@ #include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> /* global SRCU for all MMs */ @@ -275,7 +276,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, mm->mmu_notifier_mm = mmu_notifier_mm; mmu_notifier_mm = NULL; } - atomic_inc(&mm->mm_count); + mmgrab(mm); /* * Serialize the update against mmu_notifier_unregister. A diff --git a/mm/mmzone.c b/mm/mmzone.c index 5652be858e5e..a51c0a67ea3d 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, * Find the next suitable zone to use for the allocation. * Only filter based on nodemask if it's set */ - if (likely(nodes == NULL)) + if (unlikely(nodes == NULL)) while (zonelist_zone_idx(z) > highest_zoneidx) z++; else diff --git a/mm/mprotect.c b/mm/mprotect.c index 11936526b08b..8edd0d576254 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -25,7 +25,7 @@ #include <linux/perf_event.h> #include <linux/pkeys.h> #include <linux/ksm.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -33,34 +33,6 @@ #include "internal.h" -/* - * For a prot_numa update we only hold mmap_sem for read so there is a - * potential race with faulting where a pmd was temporarily none. This - * function checks for a transhuge pmd under the appropriate lock. It - * returns a pte if it was successfully locked or NULL if it raced with - * a transhuge insertion. - */ -static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, int prot_numa, spinlock_t **ptl) -{ - pte_t *pte; - spinlock_t *pmdl; - - /* !prot_numa is protected by mmap_sem held for write */ - if (!prot_numa) - return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - - pmdl = pmd_lock(vma->vm_mm, pmd); - if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { - spin_unlock(pmdl); - return NULL; - } - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); - spin_unlock(pmdl); - return pte; -} - static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -69,11 +41,31 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + int target_node = NUMA_NO_NODE; - pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + /* + * Can be called with only the mmap_sem for reading by + * prot_numa so we must check the pmd isn't constantly + * changing from under us from pmd_none to pmd_trans_huge + * and/or the other way around. + */ + if (pmd_trans_unstable(pmd)) + return 0; + + /* + * The pmd points to a regular pte so the pmd can't change + * from under us even if the mmap_sem is only hold for + * reading. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) return 0; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -95,12 +87,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + if (target_node == page_to_nid(page)) + continue; } ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) - ptent = pte_mkwrite(ptent); + ptent = pte_mk_savedwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && @@ -163,9 +162,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - split_huge_pmd(vma, pmd, addr); - if (pmd_trans_unstable(pmd)) - continue; + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); @@ -196,14 +193,14 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } static inline unsigned long change_pud_range(struct vm_area_struct *vma, - pgd_t *pgd, unsigned long addr, unsigned long end, + p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; unsigned long pages = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -215,6 +212,26 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, return pages; } +static inline unsigned long change_p4d_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) +{ + p4d_t *p4d; + unsigned long next; + unsigned long pages = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + pages += change_pud_range(vma, p4d, addr, next, newprot, + dirty_accountable, prot_numa); + } while (p4d++, addr = next, addr != end); + + return pages; +} + static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -233,7 +250,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_pud_range(vma, pgd, addr, next, newprot, + pages += change_p4d_range(vma, pgd, addr, next, newprot, dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); @@ -484,6 +501,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return do_mprotect_pkey(start, len, prot, -1); } +#ifdef CONFIG_ARCH_HAS_PKEYS + SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, unsigned long, prot, int, pkey) { @@ -534,3 +553,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) */ return ret; } + +#endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/mm/mremap.c b/mm/mremap.c index 30d7d2482eea..cd8a1b199ef9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> +#include <linux/userfaultfd_k.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -31,6 +32,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -38,7 +40,11 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pgd_none_or_clear_bad(pgd)) return NULL; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) return NULL; @@ -53,11 +59,15 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; @@ -250,7 +260,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, + bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -309,6 +321,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_addr = new_addr; new_addr = err; } else { + mremap_userfaultfd_prep(new_vma, uf); arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -338,7 +351,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -413,7 +426,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool *locked, + struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -431,12 +446,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len, NULL); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -458,7 +473,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + uf_unmap); if (!(offset_in_page(ret))) goto out; out1: @@ -497,6 +513,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -523,7 +541,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + &locked, &uf, &uf_unmap); goto out; } @@ -533,7 +551,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); if (ret && old_len != new_len) goto out; ret = addr; @@ -592,7 +610,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { @@ -602,5 +621,7 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 8b8faaf2a9e9..2d131b97a851 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -17,6 +17,7 @@ #include <linux/export.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> @@ -35,7 +36,7 @@ #include <linux/audit.h> #include <linux/printk.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -176,9 +177,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +189,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -517,7 +518,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* - * initialise the VMA and region record slabs + * initialise the percpu counter for VM and region record slabs */ void __init mmap_init(void) { @@ -757,7 +758,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache[i] == vma) { + if (curr->vmacache.vmas[i] == vma) { vmacache_invalidate(mm); break; } @@ -1084,7 +1085,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; - ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; @@ -1115,7 +1116,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * - VM_MAYSHARE will be set if it may attempt to share */ if (capabilities & NOMMU_MAP_DIRECT) { - ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); @@ -1191,7 +1192,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1205,7 +1206,8 @@ unsigned long do_mmap(struct file *file, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, - unsigned long *populate) + unsigned long *populate, + struct list_head *uf) { struct vm_area_struct *vma; struct vm_region *region; @@ -1412,13 +1414,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1577,7 +1579,7 @@ static int shrink_vma(struct mm_struct *mm, * - under NOMMU conditions the chunk to be unmapped must be backed by a single * VMA, though it need not cover the whole VMA */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { struct vm_area_struct *vma; unsigned long end; @@ -1643,7 +1645,7 @@ int vm_munmap(unsigned long addr, size_t len) int ret; down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); + ret = do_munmap(mm, addr, len, NULL); up_write(&mm->mmap_sem); return ret; } @@ -1794,21 +1796,21 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { BUG(); return 0; } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); } EXPORT_SYMBOL(filemap_map_pages); -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; @@ -1878,6 +1880,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } +EXPORT_SYMBOL_GPL(access_process_vm); /** * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..d083714a2bb9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -22,6 +22,9 @@ #include <linux/err.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/coredump.h> +#include <linux/sched/task.h> #include <linux/swap.h> #include <linux/timex.h> #include <linux/jiffies.h> @@ -403,12 +406,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed; - - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, - nodemask_pr_args(nm), oc->order, - current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", + current->comm, oc->gfp_mask, &oc->gfp_mask); + if (oc->nodemask) + pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); + else + pr_cont("(null)"); + pr_cont(", order=%d, oom_score_adj=%hd\n", + oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); @@ -417,7 +422,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } @@ -465,8 +470,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true, - .ignore_dirty = true}; bool ret = true; /* @@ -510,14 +513,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (is_vm_hugetlb_page(vma)) - continue; - - /* - * mlocked VMAs require explicit munlocking before unmap. - * Let's keep it simple here and skip such VMAs. - */ - if (vma->vm_flags & VM_LOCKED) + if (!can_madv_dontneed_vma(vma)) continue; /* @@ -532,7 +528,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - &details); + NULL); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", @@ -660,7 +656,7 @@ static void mark_oom_victim(struct task_struct *tsk) /* oom_mm is bound to the signal struct life time. */ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) - atomic_inc(&tsk->signal->oom_mm->mm_count); + mmgrab(tsk->signal->oom_mm); /* * Make sure that the task is woken up from uninterruptible sleep @@ -877,7 +873,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; - atomic_inc(&mm->mm_count); + mmgrab(mm); /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user @@ -1013,7 +1009,7 @@ bool out_of_memory(struct oom_control *oc) * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. */ - if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) return true; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 439cc63ad903..d8ac2a7fb9e7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,7 @@ #include <linux/pagevec.h> #include <linux/timer.h> #include <linux/sched/rt.h> +#include <linux/sched/signal.h> #include <linux/mm_inline.h> #include <trace/events/writeback.h> @@ -580,7 +581,7 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, __fprop_inc_percpu_max(&dom->completions, completions, max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(dom->period_time)) { + if (unlikely(!dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when @@ -1778,6 +1779,7 @@ pause: pause, start_time); __set_current_state(TASK_KILLABLE); + wb->dirty_sleep = now; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; @@ -1796,7 +1798,7 @@ pause: * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * - * In theory 1 page is enough to keep the comsumer-producer + * In theory 1 page is enough to keep the consumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. @@ -1987,11 +1989,11 @@ void laptop_mode_timer_fn(unsigned long data) * We want to write everything out, not just down to the dirty * threshold */ - if (!bdi_has_dirty_io(&q->backing_dev_info)) + if (!bdi_has_dirty_io(q->backing_dev_info)) return; rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) + list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) wb_start_writeback(wb, nr_pages, true, WB_REASON_LAPTOP_TIMER); @@ -2105,18 +2107,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6de9440e3ae2..07efbc3a8656 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,12 +55,13 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> +#include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> -#include <linux/page_ext.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> @@ -91,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +/* work_structs for global per-cpu drains */ +DEFINE_MUTEX(pcpu_drain_mutex); +DEFINE_PER_CPU(struct work_struct, pcpu_drain); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -714,7 +719,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. @@ -729,9 +734,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -787,9 +789,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +803,16 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +826,9 @@ continue_merging: zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +843,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -865,13 +867,14 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); - if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + if (pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); goto out; @@ -1864,14 +1867,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; @@ -2058,8 +2061,12 @@ out_unlock: * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. + * + * If @force is true, try to unreserve a pageblock even though highatomic + * pageblock is exhausted. */ -static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2067,11 +2074,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct zone *zone; struct page *page; int order; + bool ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { - /* Preserve at least one pageblock */ - if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2085,13 +2097,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal @@ -2103,12 +2127,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - move_freepages_block(zone, page, ac->migratetype); - spin_unlock_irqrestore(&zone->lock, flags); - return; + ret = move_freepages_block(zone, page, ac->migratetype); + if (ret) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } } spin_unlock_irqrestore(&zone->lock, flags); } + + return false; } /* Remove an element from the buddy allocator from the fallback list */ @@ -2133,7 +2161,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2192,7 +2221,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, bool cold) { - int i; + int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2217,13 +2246,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; + alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } + + /* + * i pages were removed from the buddy list even if some leak due + * to check_pcp_refill failing so adjust NR_FREE_PAGES based + * on i. Do not confuse with 'alloced' which is the number of + * pages added to the pcp list. + */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return i; + return alloced; } #ifdef CONFIG_NUMA @@ -2307,16 +2344,26 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } +static void drain_local_pages_wq(struct work_struct *work) +{ + /* + * drain_all_pages doesn't use proper cpu hotplug protection so + * we can race with cpu offline when the WQ can move this from + * a cpu pinned worker to an unbound one. We can operate on a different + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ + preempt_disable(); + drain_local_pages(NULL); + preempt_enable(); +} + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. * - * Note that this code is protected against sending an IPI to an offline - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but - * nothing keeps CPUs from showing up after we populated the cpumask and - * before the call to on_each_cpu_mask(). + * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -2329,6 +2376,28 @@ void drain_all_pages(struct zone *zone) static cpumask_t cpus_with_pcps; /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON_ONCE(!mm_percpu_wq)) + return; + + /* Workqueues cannot recurse */ + if (current->flags & PF_WQ_WORKER) + return; + + /* + * Do not drain if one is already in progress unless it's specific to + * a zone. Such callers are primarily CMA and memory hotplug and need + * the drain to be complete when the call returns. + */ + if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { + if (!zone) + return; + mutex_lock(&pcpu_drain_mutex); + } + + /* * We don't care about racing with CPU hotplug event * as offline notification will cause the notified * cpu to drain that CPU pcps and on_each_cpu_mask @@ -2358,8 +2427,16 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); + INIT_WORK(work, drain_local_pages_wq); + queue_work_on(cpu, mm_percpu_wq, work); + } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + + mutex_unlock(&pcpu_drain_mutex); } #ifdef CONFIG_HIBERNATION @@ -2534,7 +2611,8 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -2548,101 +2626,123 @@ int __isolate_free_page(struct page *page, unsigned int order) * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, - gfp_t flags) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - int local_nid = numa_node_id(); enum zone_stat_item local_stat = NUMA_LOCAL; - if (unlikely(flags & __GFP_OTHER_NODE)) { + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - if (z->node == local_nid) { + if (z->node == preferred_zone->node) __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { + else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } + __inc_zone_state(z, local_stat); #endif } +/* Remove page from the per-cpu list, caller must protect the list */ +static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + bool cold, struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + do { + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + return NULL; + } + + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } while (check_new_pcp(page)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + struct page *page; + unsigned long flags; + + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); + } + local_irq_restore(flags); + return page; +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline -struct page *buffered_rmqueue(struct zone *preferred_zone, +struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { unsigned long flags; struct page *page; - bool cold = ((gfp_flags & __GFP_COLD) != 0); if (likely(order == 0)) { - struct per_cpu_pages *pcp; - struct list_head *list; - - local_irq_save(flags); - do { - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } - - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - - list_del(&page->lru); - pcp->count--; + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype); + goto out; + } - } while (check_new_pcp(page)); - } else { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + spin_lock_irqsave(&zone->lock, flags); - do { - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); - } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } if (!page) - goto failed; - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - } + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, gfp_flags); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); - VM_BUG_ON_PAGE(bad_range(zone, page), page); +out: + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; failed: @@ -2850,7 +2950,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, #ifdef CONFIG_NUMA static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -2947,7 +3047,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -2980,18 +3080,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3006,6 +3100,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter, nodemask); +} + +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3014,11 +3122,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + + cpuset_print_current_mems_allowed(); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask, nodemask); +} + +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; } static inline struct page * @@ -3056,47 +3189,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. - */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3165,6 +3293,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3186,8 +3317,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3199,8 +3332,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3209,12 +3344,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * @@ -3305,7 +3443,7 @@ retry: * Shrink them them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac); + unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -3422,8 +3560,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) - return false; + if (*no_progress_loops > MAX_RECLAIM_RETRIES) { + /* Before OOM, exhaust highatomic_reserve */ + return unreserve_highatomic_pageblock(ac, true); + } /* * Keep reclaiming pages while there is a chance this will lead @@ -3435,6 +3575,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3445,8 +3587,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for @@ -3494,12 +3639,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; + enum compact_priority compact_priority; enum compact_result compact_result; - int compaction_retries = 0; - int no_progress_loops = 0; + int compaction_retries; + int no_progress_loops; unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; + unsigned int cpuset_mems_cookie; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3520,6 +3666,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; +retry_cpuset: + compaction_retries = 0; + no_progress_loops = 0; + compact_priority = DEF_COMPACT_PRIORITY; + cpuset_mems_cookie = read_mems_allowed_begin(); + /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up @@ -3527,6 +3679,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * We need to recalculate the starting point for the zonelist iterator + * because we might have used different nodemask in the fast path, or + * there was a cpuset modification and we are retrying - otherwise we + * could end up iterating over non-eligible zones endlessly. + */ + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); + if (!ac->preferred_zoneref->zone) + goto nopage; + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3603,35 +3766,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. - */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3655,14 +3804,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3679,11 +3820,22 @@ retry: &compaction_retries)) goto retry; + /* + * It's possible we raced with cpuset update so the OOM would be + * premature (see below the nopage: label for full explanation). + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry_cpuset; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3691,74 +3843,127 @@ retry: } nopage: - warn_alloc(gfp_mask, + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry_cpuset; + + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + + cond_resched(); + goto retry; + } +fail: + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_mask, + unsigned int *alloc_flags) { - struct page *page; - unsigned int cpuset_mems_cookie; - unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ - struct alloc_context ac = { - .high_zoneidx = gfp_zone(gfp_mask), - .zonelist = zonelist, - .nodemask = nodemask, - .migratetype = gfpflags_to_migratetype(gfp_mask), - }; + ac->high_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = zonelist; + ac->nodemask = nodemask; + ac->migratetype = gfpflags_to_migratetype(gfp_mask); if (cpusets_enabled()) { - alloc_mask |= __GFP_HARDWALL; - alloc_flags |= ALLOC_CPUSET; - if (!ac.nodemask) - ac.nodemask = &cpuset_current_mems_allowed; + *alloc_mask |= __GFP_HARDWALL; + if (!ac->nodemask) + ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; } - gfp_mask &= gfp_allowed_mask; - lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) - return NULL; - - /* - * Check the zones suitable for the gfp_mask contain at least one - * valid zone. It's possible to have an empty zonelist as a result - * of __GFP_THISNODE and a memoryless node - */ - if (unlikely(!zonelist->_zonerefs->zone)) - return NULL; + return false; - if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); + return true; +} +/* Determine whether to spread dirty pages and what the first usable zone */ +static inline void finalise_ac(gfp_t gfp_mask, + unsigned int order, struct alloc_context *ac) +{ /* Dirty zone balancing only done in the fast path */ - ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ - ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, - ac.high_zoneidx, ac.nodemask); - if (!ac.preferred_zoneref) { - page = NULL; - goto no_zone; - } + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + gfp_mask &= gfp_allowed_mask; + if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + return NULL; + + finalise_ac(gfp_mask, order, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -3776,21 +3981,10 @@ retry_cpuset: * Restore the original nodemask if it was potentially replaced with * &cpuset_current_mems_allowed to optimize the fast-path attempt. */ - if (cpusets_enabled()) + if (unlikely(ac.nodemask != nodemask)) ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); -no_zone: - /* - * When updating a task's mems_allowed, it is possible to race with - * parallel threads in such a way that an allocation can fail while - * the mask is being updated. If a page allocation is about to fail, - * check if the cpuset changed during allocation and if so, retry. - */ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { - alloc_mask = gfp_mask; - goto retry_cpuset; - } + page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && @@ -3867,8 +4061,8 @@ EXPORT_SYMBOL(free_pages); * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ -static struct page *__page_frag_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) { struct page *page = NULL; gfp_t gfp = gfp_mask; @@ -3888,8 +4082,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } -void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void __page_frag_cache_drain(struct page *page, unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) { + unsigned int order = compound_order(page); + + if (order == 0) + free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +} +EXPORT_SYMBOL(__page_frag_cache_drain); + +void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -3897,7 +4106,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc, if (unlikely(!nc->va)) { refill: - page = __page_frag_refill(nc, gfp_mask); + page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; @@ -3940,19 +4149,19 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(__alloc_page_frag); +EXPORT_SYMBOL(page_frag_alloc); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ -void __free_page_frag(void *addr) +void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) __free_pages_ok(page, compound_order(page)); } -EXPORT_SYMBOL(__free_page_frag); +EXPORT_SYMBOL(page_frag_free); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) @@ -4182,20 +4391,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4236,7 +4445,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4244,7 +4453,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4278,6 +4487,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" @@ -4311,13 +4523,13 @@ void show_free_areas(unsigned int filter) K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif - K(node_page_state(pgdat, NR_SHMEM)), K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), @@ -4327,7 +4539,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4392,7 +4604,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); @@ -5013,8 +5225,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) @@ -5710,7 +5931,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * the zone and SPARSEMEM is in use. If there are holes within the * zone, each populated memory region may cost us one or two extra * memmap pages due to alignment because memmap pages for each - * populated regions may not naturally algined on page boundary. + * populated regions may not be naturally aligned on page boundary. * So the (present_pages >> 4) heuristic is a tradeoff for that. */ if (spanned_pages > present_pages + (present_pages >> 4) && @@ -6274,8 +6495,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; - arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); @@ -6399,8 +6618,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } @@ -6491,38 +6710,39 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -static int page_alloc_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int page_alloc_cpu_dead(unsigned int cpu) { - int cpu = (unsigned long)hcpu; - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - lru_add_drain_cpu(cpu); - drain_pages(cpu); + lru_add_drain_cpu(cpu); + drain_pages(cpu); - /* - * Spill the event counters of the dead processor - * into the current processors event counters. - * This artificially elevates the count of the current - * processor. - */ - vm_events_fold_cpu(cpu); + /* + * Spill the event counters of the dead processor + * into the current processors event counters. + * This artificially elevates the count of the current + * processor. + */ + vm_events_fold_cpu(cpu); - /* - * Zero the differential counters of the dead processor - * so that the vm statistics are consistent. - * - * This is only okay since the processor is dead and cannot - * race with what we are doing. - */ - cpu_vm_stats_fold(cpu); - } - return NOTIFY_OK; + /* + * Zero the differential counters of the dead processor + * so that the vm statistics are consistent. + * + * This is only okay since the processor is dead and cannot + * race with what we are doing. + */ + cpu_vm_stats_fold(cpu); + return 0; } void __init page_alloc_init(void) { - hotcpu_notifier(page_alloc_cpu_notify, 0); + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, + "mm/page_alloc:dead", NULL, + page_alloc_cpu_dead); + WARN_ON(ret < 0); } /* @@ -7010,8 +7230,9 @@ void *__init alloc_large_system_hash(const char *tablename, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -7067,6 +7288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -7178,6 +7402,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7191,7 +7416,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7203,6 +7428,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_idle.c b/mm/page_idle.c index ae11aa914e55..b0ee56c56b58 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -54,27 +54,27 @@ static int page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = addr, + }; bool referenced = false; - if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) - return SWAP_AGAIN; - - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap(pte); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); - } else { - /* unexpected pmd-mapped page? */ - WARN_ON_ONCE(1); + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + referenced = ptep_clear_young_notify(vma, addr, + pvmw.pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + referenced = pmdp_clear_young_notify(vma, addr, + pvmw.pmd); + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); + } } - spin_unlock(ptl); - if (referenced) { clear_page_idle(page); /* diff --git a/mm/page_io.c b/mm/page_io.c index a2651f58c86a..23f6d0d3470f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - if (wbc->sync_mode == WB_SYNC_ALL) - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC); - else - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a5594bfcc5ed..f4e17a57926a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) unsigned long flags, nr_pages; bool isolated_page = false; unsigned int order; - unsigned long page_idx, buddy_idx; + unsigned long pfn, buddy_pfn; struct page *buddy; zone = page_zone(page); @@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (PageBuddy(page)) { order = page_order(page); if (order >= pageblock_order) { - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + pfn = page_to_pfn(page); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); - if (pfn_valid_within(page_to_pfn(buddy)) && + if (pfn_valid_within(buddy_pfn) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); isolated_page = true; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c new file mode 100644 index 000000000000..de9c40d7304a --- /dev/null +++ b/mm/page_vma_mapped.c @@ -0,0 +1,223 @@ +#include <linux/mm.h> +#include <linux/rmap.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include "internal.h" + +static inline bool check_pmd(struct page_vma_mapped_walk *pvmw) +{ + pmd_t pmde; + /* + * Make sure we don't re-load pmd between present and !trans_huge check. + * We need a consistent view. + */ + pmde = READ_ONCE(*pvmw->pmd); + return pmd_present(pmde) && !pmd_trans_huge(pmde); +} + +static inline bool not_found(struct page_vma_mapped_walk *pvmw) +{ + page_vma_mapped_walk_done(pvmw); + return false; +} + +static bool map_pte(struct page_vma_mapped_walk *pvmw) +{ + pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); + if (!(pvmw->flags & PVMW_SYNC)) { + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(*pvmw->pte)) + return false; + } else { + if (!pte_present(*pvmw->pte)) + return false; + } + } + pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + spin_lock(pvmw->ptl); + return true; +} + +static bool check_pte(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->flags & PVMW_MIGRATION) { +#ifdef CONFIG_MIGRATION + swp_entry_t entry; + if (!is_swap_pte(*pvmw->pte)) + return false; + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) + return false; + if (migration_entry_to_page(entry) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (migration_entry_to_page(entry) < pvmw->page) + return false; +#else + WARN_ON_ONCE(1); +#endif + } else { + if (!pte_present(*pvmw->pte)) + return false; + + /* THP can be referenced by any subpage */ + if (pte_page(*pvmw->pte) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (pte_page(*pvmw->pte) < pvmw->page) + return false; + } + + return true; +} + +/** + * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * @pvmw->address + * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags + * must be set. pmd, pte and ptl must be NULL. + * + * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point + * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is + * adjusted if needed (for PTE-mapped THPs). + * + * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page + * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in + * a loop to find all PTEs that map the THP. + * + * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry + * regardless of which page table level the page is mapped at. @pvmw->pmd is + * NULL. + * + * Retruns false if there are no more page table entries for the page in + * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. + * + * If you need to stop the walk before page_vma_mapped_walk() returned false, + * use page_vma_mapped_walk_done(). It will do the housekeeping. + */ +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) +{ + struct mm_struct *mm = pvmw->vma->vm_mm; + struct page *page = pvmw->page; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + /* The only possible pmd mapping has been handled on last iteration */ + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + + if (pvmw->pte) + goto next_pte; + + if (unlikely(PageHuge(pvmw->page))) { + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address); + if (!pvmw->pte) + return false; + + pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + spin_lock(pvmw->ptl); + if (!check_pte(pvmw)) + return not_found(pvmw); + return true; + } +restart: + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) + return false; + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) + return false; + pud = pud_offset(p4d, pvmw->address); + if (!pud_present(*pud)) + return false; + pvmw->pmd = pmd_offset(pud, pvmw->address); + if (pmd_trans_huge(*pvmw->pmd)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + if (!pmd_present(*pvmw->pmd)) + return not_found(pvmw); + if (likely(pmd_trans_huge(*pvmw->pmd))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (pmd_page(*pvmw->pmd) != page) + return not_found(pvmw); + return true; + } else { + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + } else { + if (!check_pmd(pvmw)) + return false; + } + if (!map_pte(pvmw)) + goto next_pte; + while (1) { + if (check_pte(pvmw)) + return true; +next_pte: + /* Seek to next pte only makes sense for THP */ + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + do { + pvmw->address += PAGE_SIZE; + if (pvmw->address >= pvmw->vma->vm_end || + pvmw->address >= + __vma_address(pvmw->page, pvmw->vma) + + hpage_nr_pages(pvmw->page) * PAGE_SIZE) + return not_found(pvmw); + /* Did we cross page table boundary? */ + if (pvmw->address % PMD_SIZE == 0) { + pte_unmap(pvmw->pte); + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + goto restart; + } else { + pvmw->pte++; + } + } while (pte_none(*pvmw->pte)); + + if (!pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + } +} + +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .flags = PVMW_SYNC, + }; + unsigned long start, end; + + start = __vma_address(page, vma); + end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1); + + if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + return 0; + pvmw.address = max(start, vma->vm_start); + if (!page_vma_mapped_walk(&pvmw)) + return 0; + page_vma_mapped_walk_done(&pvmw); + return 1; +} diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 207244489a68..60f7856e508f 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -69,23 +69,41 @@ again: return err; } -static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, +static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; int err = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { + again: next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { + if (pud_none(*pud) || !walk->vma) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + + if (walk->pud_entry) { + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } + + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) @@ -95,6 +113,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -113,7 +157,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, continue; } if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(pgd, addr, next, walk); + err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 538998a137d2..9ac639499bd1 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -21,7 +21,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, /** * pcpu_get_pages - get temp pages array - * @chunk: chunk of interest * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses @@ -30,7 +29,7 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, * RETURNS: * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) +static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); @@ -275,7 +274,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, { struct page **pages; - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); if (!pages) return -ENOMEM; @@ -313,7 +312,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ diff --git a/mm/percpu.c b/mm/percpu.c index 255714302394..60a6488e9e6d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * Chunks can be determined from the address using the index field * in the page struct. The index field contains a pointer to the chunk. * - * To use this allocator, arch code should do the followings. + * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -886,7 +886,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); - if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || + !is_power_of_2(align))) { WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; @@ -1010,8 +1011,11 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) + if (chunk != pcpu_reserved_chunk) { + spin_lock_irqsave(&pcpu_lock, flags); pcpu_nr_empty_pop_pages -= occ_pages; + spin_unlock_irqrestore(&pcpu_lock, flags); + } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); @@ -2093,6 +2097,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, size_t pages_size; struct page **pages; int unit, i, j, rc; + int upa; + int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -2100,7 +2106,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size, if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -2111,21 +2122,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* allocate pages */ j = 0; - for (unit = 0; unit < num_possible_cpus(); unit++) + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { - unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", - psize_str, cpu); + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } + } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 71c5f9109f2a..c99d9512a45b 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -22,6 +22,12 @@ void pgd_clear_bad(pgd_t *pgd) pgd_clear(pgd); } +void p4d_clear_bad(p4d_t *p4d) +{ + p4d_ERROR(*p4d); + p4d_clear(p4d); +} + void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); @@ -123,6 +129,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t pud; + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return pud; +} +#endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..8973cd231ece 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> @@ -88,7 +89,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +101,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/mm/readahead.c b/mm/readahead.c index c8a955b1297e..c4ca70239233 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -207,12 +207,21 @@ out: * memory at once. */ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long max_pages; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { int err; @@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = ra->ra_pages; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages = ra->ra_pages; pgoff_t prev_offset; /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + + /* * start of file */ if (!offset) @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping, if ((offset == (ra->start + ra->size - ra->async_size) || offset == (ra->start + ra->size))) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max); + start = page_cache_next_hole(mapping, offset + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max) + if (!start || start - offset > max_pages) return 0; ra->start = start; ra->size = start - offset; /* old async_size */ ra->size += req_size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping, /* * oversize read */ - if (req_size > max) + if (req_size > max_pages) goto initial_readahead; /* @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max)) + if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) goto readit; /* @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping, initial_readahead: ra->start = offset; - ra->size = get_init_ra_size(req_size, max); + ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: @@ -454,7 +471,7 @@ readit: * the resulted next readahead window into the current one. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max); + ra->async_size = get_next_ra_size(ra, max_pages); ra->size += ra->async_size; } diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..f6838015810f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -46,6 +46,8 @@ */ #include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -141,14 +143,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, } /** - * anon_vma_prepare - attach an anon_vma to a memory region + * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * - * The common case will be that we already have one, but if + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we @@ -167,48 +170,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * This must be called with the mmap_sem held for reading. */ -int anon_vma_prepare(struct vm_area_struct *vma) +int __anon_vma_prepare(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); - if (unlikely(!anon_vma)) { - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_enomem; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + allocated = anon_vma; + } - anon_vma = find_mergeable_anon_vma(vma); + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* vma reference or self-parent link for new root */ + anon_vma->degree++; allocated = NULL; - if (!anon_vma) { - anon_vma = anon_vma_alloc(); - if (unlikely(!anon_vma)) - goto out_enomem_free_avc; - allocated = anon_vma; - } + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); - anon_vma_lock_write(anon_vma); - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { - vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; - allocated = NULL; - avc = NULL; - } - spin_unlock(&mm->page_table_lock); - anon_vma_unlock_write(anon_vma); + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); - if (unlikely(allocated)) - put_anon_vma(allocated); - if (unlikely(avc)) - anon_vma_chain_free(avc); - } return 0; out_enomem_free_avc: @@ -608,8 +609,7 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page, bool writable) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; @@ -644,8 +644,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } #else -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page, bool writable) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { } @@ -685,6 +684,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pmd_t pmde; @@ -693,7 +693,11 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; @@ -711,170 +715,6 @@ out: return pmd; } -/* - * Check that @page is mapped at @address into @mm. - * - * If @sync is false, page_check_address may perform a racy check to avoid - * the page table lock when the pte is not present (helpful when reclaiming - * highly shared pages). - * - * On success returns with pte mapped and locked. - */ -pte_t *__page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp, int sync) -{ - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return NULL; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - goto check; - } - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return NULL; - - pte = pte_offset_map(pmd, address); - /* Make a quick check before getting the lock */ - if (!sync && !pte_present(*pte)) { - pte_unmap(pte); - return NULL; - } - - ptl = pte_lockptr(mm, pmd); -check: - spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; - } - pte_unmap_unlock(pte, ptl); - return NULL; -} - -/** - * page_mapped_in_vma - check whether a page is really mapped in a VMA - * @page: the page to test - * @vma: the VMA to test - * - * Returns 1 if the page is mapped into the page tables of the VMA, 0 - * if the page is not mapped into the page tables of this VMA. Only - * valid for normal file or anonymous VMAs. - */ -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) -{ - unsigned long address; - pte_t *pte; - spinlock_t *ptl; - - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - return 0; - pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); - if (!pte) /* the page is not in this mm */ - return 0; - pte_unmap_unlock(pte, ptl); - - return 1; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * Check that @page is mapped at @address into @mm. In contrast to - * page_check_address(), this function can handle transparent huge pages. - * - * On success returns true with pte mapped and locked. For PMD-mapped - * transparent huge pages *@ptep is set to NULL. - */ -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return false; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - pmd = NULL; - goto check_pte; - } - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return false; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return false; - pmd = pmd_offset(pud, address); - - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock_pmd; - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - goto map_pte; - } - - if (pmd_page(*pmd) != page) - goto unlock_pmd; - - pte = NULL; - goto found; -unlock_pmd: - spin_unlock(ptl); - return false; - } else { - pmd_t pmde = *pmd; - - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - return false; - } -map_pte: - pte = pte_offset_map(pmd, address); - if (!pte_present(*pte)) { - pte_unmap(pte); - return false; - } - - ptl = pte_lockptr(mm, pmd); -check_pte: - spin_lock(ptl); - - if (!pte_present(*pte)) { - pte_unmap_unlock(pte, ptl); - return false; - } - - /* THP can be referenced by any subpage */ - if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { - pte_unmap_unlock(pte, ptl); - return false; - } -found: - *ptep = pte; - *pmdp = pmd; - *ptlp = ptl; - return true; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - struct page_referenced_arg { int mapcount; int referenced; @@ -887,45 +727,48 @@ struct page_referenced_arg { static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct mm_struct *mm = vma->vm_mm; struct page_referenced_arg *pra = arg; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; int referenced = 0; - if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) - return SWAP_AGAIN; + while (page_vma_mapped_walk(&pvmw)) { + address = pvmw.address; - if (vma->vm_flags & VM_LOCKED) { - if (pte) - pte_unmap(pte); - spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + if (vma->vm_flags & VM_LOCKED) { + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ + } - if (pte) { - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!(vma->vm_flags & VM_SEQ_READ))) + if (pvmw.pte) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* + * Don't treat a reference through + * a sequentially read mapping as such. + * If the page has been used in another mapping, + * we will catch it; if this other mapping is + * already gone, the unmap path will have set + * PG_referenced or activated the page. + */ + if (likely(!(vma->vm_flags & VM_SEQ_READ))) + referenced++; + } + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) referenced++; + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } - pte_unmap(pte); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - if (pmdp_clear_flush_young_notify(vma, address, pmd)) - referenced++; - } else { - /* unexpected pmd-mapped page? */ - WARN_ON_ONCE(1); + + pra->mapcount--; } - spin_unlock(ptl); if (referenced) clear_page_idle(page); @@ -937,7 +780,6 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pra->vm_flags |= vma->vm_flags; } - pra->mapcount--; if (!pra->mapcount) return SWAP_SUCCESS; /* To break the loop */ @@ -1016,34 +858,56 @@ int page_referenced(struct page *page, static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { - struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; - int ret = 0; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + .flags = PVMW_SYNC, + }; int *cleaned = arg; - pte = page_check_address(page, mm, address, &ptl, 1); - if (!pte) - goto out; - - if (pte_dirty(*pte) || pte_write(*pte)) { - pte_t entry; + while (page_vma_mapped_walk(&pvmw)) { + int ret = 0; + address = pvmw.address; + if (pvmw.pte) { + pte_t entry; + pte_t *pte = pvmw.pte; + + if (!pte_dirty(*pte) && !pte_write(*pte)) + continue; + + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(vma->vm_mm, address, pte, entry); + ret = 1; + } else { +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + pmd_t *pmd = pvmw.pmd; + pmd_t entry; + + if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) + continue; + + flush_cache_page(vma, address, page_to_pfn(page)); + entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmd_wrprotect(entry); + entry = pmd_mkclean(entry); + set_pmd_at(vma->vm_mm, address, pmd, entry); + ret = 1; +#else + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); +#endif + } - flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); - entry = pte_wrprotect(entry); - entry = pte_mkclean(entry); - set_pte_at(mm, address, pte, entry); - ret = 1; + if (ret) { + mmu_notifier_invalidate_page(vma->vm_mm, address); + (*cleaned)++; + } } - pte_unmap_unlock(pte, ptl); - - if (ret) { - mmu_notifier_invalidate_page(mm, address); - (*cleaned)++; - } -out: return SWAP_AGAIN; } @@ -1295,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1335,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1436,155 +1300,164 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; + struct page_vma_mapped_walk pvmw = { + .page = page, + .vma = vma, + .address = address, + }; pte_t pteval; - spinlock_t *ptl; + struct page *subpage; int ret = SWAP_AGAIN; struct rmap_private *rp = arg; enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - goto out; + return SWAP_AGAIN; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, flags & TTU_MIGRATION, page); - /* check if we have anything to do after split */ - if (page_mapcount(page) == 0) - goto out; } - pte = page_check_address(page, mm, address, &ptl, - PageTransCompound(page)); - if (!pte) - goto out; - - /* - * If the page is mlock()d, we cannot swap it out. - * If it's recently referenced (perhaps page_referenced - * skipped over this mm) then we should reactivate it. - */ - if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - /* PTE-mapped THP are never mlocked */ - if (!PageTransCompound(page)) { - /* - * Holding pte lock, we do *not* need - * mmap_sem here - */ - mlock_vma_page(page); + while (page_vma_mapped_walk(&pvmw)) { + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if (!(flags & TTU_IGNORE_MLOCK)) { + if (vma->vm_flags & VM_LOCKED) { + /* PTE-mapped THP are never mlocked */ + if (!PageTransCompound(page)) { + /* + * Holding pte lock, we do *not* need + * mmap_sem here + */ + mlock_vma_page(page); + } + ret = SWAP_MLOCK; + page_vma_mapped_walk_done(&pvmw); + break; } - ret = SWAP_MLOCK; - goto out_unmap; + if (flags & TTU_MUNLOCK) + continue; } - if (flags & TTU_MUNLOCK) - goto out_unmap; - } - if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, address, pte)) { - ret = SWAP_FAIL; - goto out_unmap; - } - } - /* Nuke the page table entry. */ - flush_cache_page(vma, address, page_to_pfn(page)); - if (should_defer_flush(mm, flags)) { - /* - * We clear the PTE but do not flush so potentially a remote - * CPU could still be writing to the page. If the entry was - * previously clean then the architecture must guarantee that - * a clear->dirty transition on a cached TLB entry is written - * through and traps if the PTE is unmapped. - */ - pteval = ptep_get_and_clear(mm, address, pte); + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); - set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); - } else { - pteval = ptep_clear_flush(vma, address, pte); - } + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); + if (!(flags & TTU_IGNORE_ACCESS)) { + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the page. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (PageHuge(page)) { - hugetlb_count_sub(1 << compound_order(page), mm); + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); } else { - dec_mm_counter(mm, mm_counter(page)); + pteval = ptep_clear_flush(vma, address, pvmw.pte); } - set_pte_at(mm, address, pte, - swp_entry_to_pte(make_hwpoison_entry(page))); - } else if (pte_unused(pteval)) { - /* - * The guest indicated that the page content is of no - * interest anymore. Simply discard the pte, vmscan - * will take care of the rest. - */ - dec_mm_counter(mm, mm_counter(page)); - } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { - swp_entry_t entry; - pte_t swp_pte; - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - entry = make_migration_entry(page, pte_write(pteval)); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, address, pte, swp_pte); - } else if (PageAnon(page)) { - swp_entry_t entry = { .val = page_private(page) }; - pte_t swp_pte; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; - } + /* Move the dirty bit to the page. Now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(mm, address, pte, swp_pte); - } else - dec_mm_counter(mm, mm_counter_file(page)); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); -discard: - page_remove_rmap(page, PageHuge(page)); - put_page(page); + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHuge(page)) { + int nr = 1 << compound_order(page); + hugetlb_count_sub(nr, mm); + } else { + dec_mm_counter(mm, mm_counter(page)); + } + + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); + set_pte_at(mm, address, pvmw.pte, pteval); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + dec_mm_counter(mm, mm_counter(page)); + } else if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(subpage, + pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + } else if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(subpage) }; + pte_t swp_pte; + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); -out_unmap: - pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } + + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pvmw.pte, pteval); + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + } else + dec_mm_counter(mm, mm_counter_file(page)); +discard: + page_remove_rmap(subpage, PageHuge(page)); + put_page(page); mmu_notifier_invalidate_page(mm, address); -out: + } return ret; } @@ -1609,7 +1482,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) static int page_mapcount_is_zero(struct page *page) { - return !page_mapcount(page); + return !total_mapcount(page); } /** @@ -1756,7 +1629,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; - pgoff_t pgoff; + pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1770,8 +1643,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!anon_vma) return ret; - pgoff = page_to_pgoff(page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { + pgoff_start = page_to_pgoff(page); + pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1809,7 +1684,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); - pgoff_t pgoff; + pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1824,10 +1699,12 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (!mapping) return ret; - pgoff = page_to_pgoff(page); + pgoff_start = page_to_pgoff(page); + pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; if (!locked) i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, + pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); cond_resched(); diff --git a/mm/rodata_test.c b/mm/rodata_test.c new file mode 100644 index 000000000000..0fd21670b513 --- /dev/null +++ b/mm/rodata_test.c @@ -0,0 +1,56 @@ +/* + * rodata_test.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/uaccess.h> +#include <asm/sections.h> + +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +void rodata_test(void) +{ + unsigned long start, end; + int zero = 0; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + pr_err("rodata_test: test 1 fails (start data)\n"); + return; + } + + /* test 2: write to the variable; this should fault */ + if (!probe_kernel_write((void *)&rodata_test_data, + (void *)&zero, sizeof(zero))) { + pr_err("rodata_test: test data was not read only\n"); + return; + } + + /* test 3: check the value hasn't changed */ + if (rodata_test_data == zero) { + pr_err("rodata_test: test data was changed\n"); + return; + } + + /* test 4: check if the rodata section is PAGE_SIZE aligned */ + start = (unsigned long)__start_rodata; + end = (unsigned long)__end_rodata; + if (start & (PAGE_SIZE - 1)) { + pr_err("rodata_test: start of .rodata is not page size aligned\n"); + return; + } + if (end & (PAGE_SIZE - 1)) { + pr_err("rodata_test: end of .rodata is not page size aligned\n"); + return; + } + + pr_info("rodata_test: all tests were successful\n"); +} diff --git a/mm/shmem.c b/mm/shmem.c index 9d32e1cb9f38..e67d6ba4e98e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,11 +29,14 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> +#include <linux/sched/signal.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/khugepaged.h> +#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ + static struct vfsmount *shm_mnt; #ifdef CONFIG_SHMEM @@ -70,8 +73,10 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> +#include <linux/userfaultfd_k.h> +#include <linux/rmap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include "internal.h" @@ -115,13 +120,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, int *fault_type); int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -190,6 +196,11 @@ static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_vm_ops; +} + static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -300,18 +311,19 @@ void shmem_uncharge(struct inode *inode, long pages) static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { + struct radix_tree_node *node; void **pslot; void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (!pslot) + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + if (!item) return -ENOENT; - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - radix_tree_replace_slot(pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } @@ -370,6 +382,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, int shmem_huge __read_mostly; +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -407,11 +420,13 @@ static const char *shmem_format_huge(int huge) return "bad_val"; } } +#endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { LIST_HEAD(list), *pos, *next; + LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct page *page; @@ -438,9 +453,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* Check if there's anything to gain */ if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_del_init(&info->shrinklist); + list_move(&info->shrinklist, &to_remove); removed++; - iput(inode); goto next; } @@ -451,6 +465,13 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &to_remove) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + list_del_init(&info->shrinklist); + iput(inode); + } + list_for_each_safe(pos, next, &list) { int ret; @@ -658,8 +679,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -938,10 +959,10 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int shmem_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { @@ -1046,6 +1067,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. */ @@ -1059,7 +1104,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1536,10 +1581,10 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, int *fault_type) + struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct mem_cgroup *memcg; @@ -1589,9 +1634,8 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = fault_mm ? : current->mm; + charge_mm = vma ? vma->vm_mm : current->mm; if (swap.val) { /* Look it up and read it in.. */ @@ -1601,7 +1645,8 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(charge_mm, + PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); @@ -1670,6 +1715,11 @@ repeat: swap_free(swap); } else { + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } + /* shmem_symlink() */ if (mapping->a_ops != &shmem_aops) goto alloc_nohuge; @@ -1837,7 +1887,6 @@ unlock: put_page(page); } if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1860,8 +1909,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync return ret; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shmem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; @@ -1933,7 +1983,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) sgp = SGP_NOHUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, - gfp, vma->vm_mm, &ret); + gfp, vma, vmf, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); return ret; @@ -2143,10 +2193,123 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - if (!mapping->host) - return false; + return mapping->a_ops == &shmem_aops; +} - return mapping->host->i_sb->s_op == &shmem_ops; +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct mem_cgroup *memcg; + spinlock_t *ptl; + void *page_kaddr; + struct page *page; + pte_t _dst_pte, *dst_pte; + int ret; + + ret = -ENOMEM; + if (shmem_acct_block(info->flags, 1)) + goto out; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) + goto out_unacct_blocks; + percpu_counter_inc(&sbinfo->used_blocks); + } + + if (!*pagep) { + page = shmem_alloc_page(gfp, info, pgoff); + if (!page) + goto out_dec_used_blocks; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); + shmem_unacct_blocks(info->flags, 1); + /* don't free the page */ + return -EFAULT; + } + } else { + page = *pagep; + *pagep = NULL; + } + + VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + + ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + if (ret) + goto out_release; + + ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!ret) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); + radix_tree_preload_end(); + } + if (ret) + goto out_release_uncharge; + + mem_cgroup_commit_charge(page, memcg, false, false); + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + lru_cache_add_anon(page); + + spin_lock(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + inc_mm_counter(dst_mm, mm_counter_file(page)); + page_add_file_rmap(page, false); + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + unlock_page(page); + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); +out_release_uncharge: + mem_cgroup_cancel_charge(page, memcg, false); +out_release: + unlock_page(page); + put_page(page); +out_dec_used_blocks: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +out_unacct_blocks: + shmem_unacct_blocks(info->flags, 1); + goto out; } #ifdef CONFIG_TMPFS @@ -2169,7 +2332,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals)) { + if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { if (info->seals & F_SEAL_WRITE) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) @@ -2446,8 +2609,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2516,8 +2679,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -3187,7 +3350,6 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3195,7 +3357,6 @@ static const struct inode_operations shmem_short_symlink_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -4110,7 +4271,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, BUG_ON(mapping->a_ops != &shmem_aops); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, - gfp, NULL, NULL); + gfp, NULL, NULL, NULL); if (error) page = ERR_PTR(error); else diff --git a/mm/slab.c b/mm/slab.c index 0b0550ca85b4..807d86c76908 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -116,6 +116,7 @@ #include <linux/kmemcheck.h> #include <linux/memory.h> #include <linux/prefetch.h> +#include <linux/sched/task_stack.h> #include <net/sock.h> @@ -227,13 +228,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); + parent->total_slabs = 0; + parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; - parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -551,12 +553,7 @@ static void start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); - /* - * When this gets called from do_initcalls via cpucache_init(), - * init_workqueues() has already run, so keventd will be setup - * at that time. - */ - if (keventd_up() && reap_work->work.func == NULL) { + if (reap_work->work.func == NULL) { init_reap_node(cpu); INIT_DEFERRABLE_WORK(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, @@ -1292,7 +1289,8 @@ void __init kmem_cache_init(void) * Initialize the caches that provide memory for the kmem_cache_node * structures first. Without this, further allocations will bug. */ - kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", + kmalloc_caches[INDEX_NODE] = create_kmalloc_cache( + kmalloc_info[INDEX_NODE].name, kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); @@ -1366,7 +1364,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { #if DEBUG struct kmem_cache_node *n; - struct page *page; unsigned long flags; int node; static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -1381,32 +1378,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, num_objs = 0, free_objects = 0; - unsigned long active_slabs = 0, num_slabs = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full; + unsigned long total_slabs, free_slabs, free_objs; spin_lock_irqsave(&n->list_lock, flags); - num_slabs = n->num_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; - num_slabs_partial++; - } - list_for_each_entry(page, &n->slabs_free, lru) - num_slabs_free++; - - free_objects += n->free_objects; + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, num_objs, - free_objects); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); } #endif } @@ -2318,7 +2301,8 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); - n->num_slabs--; + n->free_slabs--; + n->total_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2332,7 +2316,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2350,9 +2334,16 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) return (ret ? 1 : 0); } +#ifdef CONFIG_MEMCG +void __kmemcg_cache_deactivate(struct kmem_cache *cachep) +{ + __kmem_cache_shrink(cachep); +} +#endif + int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) @@ -2475,7 +2466,6 @@ union freelist_init_state { unsigned int pos; unsigned int *list; unsigned int count; - unsigned int rand; }; struct rnd_state rnd_state; }; @@ -2501,8 +2491,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, } else { state->list = cachep->random_seq; state->count = count; - state->pos = 0; - state->rand = rand; + state->pos = rand % count; ret = true; } return ret; @@ -2511,7 +2500,9 @@ static bool freelist_state_initialize(union freelist_init_state *state, /* Get the next entry on the list and randomize it using a random shift */ static freelist_idx_t next_random_slot(union freelist_init_state *state) { - return (state->list[state->pos++] + state->rand) % state->count; + if (state->pos >= state->count) + state->pos = 0; + return state->list[state->pos++]; } /* Swap two freelist entries */ @@ -2753,12 +2744,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); - if (!page->active) + n->total_slabs++; + if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); - else + n->free_slabs++; + } else fixup_slab_list(cachep, n, page, &list); - n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -2903,9 +2895,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (!page->active) + if (!page->active) { list_add_tail(&page->lru, &n->slabs_free); - else + n->free_slabs++; + } else list_add_tail(&page->lru, &n->slabs_partial); list_for_each_entry(page, &n->slabs_partial, lru) { @@ -2913,9 +2906,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, return page; } + n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { - if (!PageSlabPfmemalloc(page)) + if (!PageSlabPfmemalloc(page)) { + n->free_slabs--; return page; + } } return NULL; @@ -2925,16 +2921,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); + assert_spin_locked(&n->list_lock); + page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_free, struct page, + lru); + if (page) + n->free_slabs--; } if (sk_memalloc_socks()) - return get_valid_first_slab(n, page, pfmemalloc); + page = get_valid_first_slab(n, page, pfmemalloc); return page; } @@ -3434,9 +3432,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) + if (page->active == 0) { list_add(&page->lru, &n->slabs_free); - else { + n->free_slabs++; + } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3450,7 +3449,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); - n->num_slabs--; + n->free_slabs--; + n->total_slabs--; } } @@ -4102,64 +4102,33 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full = 0; - const char *name; - char *error = NULL; + unsigned long active_objs, num_objs, active_slabs; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; int node; struct kmem_cache_node *n; - active_objs = 0; - num_slabs = 0; for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->num_slabs; + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; + free_objs += n->free_objects; - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; - num_slabs_partial++; - } - - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; - num_slabs_free++; - } - - free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; spin_unlock_irq(&n->list_lock); } - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - - if (num_objs - active_objs != free_objects && !error) - error = "free_objects accounting error"; - - name = cachep->name; - if (error) - pr_err("slab: cache %s error: %s\n", name, error); + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; + active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; sinfo->active_slabs = active_slabs; - sinfo->num_slabs = num_slabs; + sinfo->num_slabs = total_slabs; sinfo->shared_avail = shared_avail; sinfo->limit = cachep->limit; sinfo->batchcount = cachep->batchcount; diff --git a/mm/slab.h b/mm/slab.h index bc05fdc3edce..65e7c3fcac72 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,6 +71,12 @@ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name; + unsigned long size; +} kmalloc_info[]; + unsigned long calculate_alignment(unsigned long flags, unsigned long align, unsigned long size); @@ -142,11 +148,27 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_CACHE_FLAGS (0) #endif +/* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_NOTRACK | \ + SLAB_ACCOUNT) + int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); +void __kmemcg_cache_deactivate(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -180,17 +202,22 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. */ #define for_each_memcg_cache(iter, root) \ - list_for_each_entry(iter, &(root)->memcg_params.list, \ - memcg_params.list) + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) static inline bool is_root_cache(struct kmem_cache *s) { - return s->memcg_params.is_root_cache; + return !s->memcg_params.root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, @@ -279,9 +306,16 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s); +extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* If !memcg, all caches are root. */ +#define slab_root_caches slab_caches +#define root_caches_node list + #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -326,6 +360,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_link_cache(struct kmem_cache *s) +{ +} + #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) @@ -432,7 +471,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long num_slabs; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ @@ -472,6 +512,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); diff --git a/mm/slab_common.c b/mm/slab_common.c index 329b03843863..09d0e849b07f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -30,6 +30,11 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; +static LIST_HEAD(slab_caches_to_rcu_destroy); +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + slab_caches_to_rcu_destroy_workfn); + /* * Set of flags that will prevent slab merging */ @@ -133,11 +138,14 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +LIST_HEAD(slab_root_caches); + void slab_init_memcg_params(struct kmem_cache *s) { - s->memcg_params.is_root_cache = true; - INIT_LIST_HEAD(&s->memcg_params.list); + s->memcg_params.root_cache = NULL; RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); + INIT_LIST_HEAD(&s->memcg_params.children); } static int init_memcg_params(struct kmem_cache *s, @@ -145,10 +153,11 @@ static int init_memcg_params(struct kmem_cache *s, { struct memcg_cache_array *arr; - if (memcg) { - s->memcg_params.is_root_cache = false; - s->memcg_params.memcg = memcg; + if (root_cache) { s->memcg_params.root_cache = root_cache; + s->memcg_params.memcg = memcg; + INIT_LIST_HEAD(&s->memcg_params.children_node); + INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; } @@ -177,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - if (!is_root_cache(s)) - return 0; - new = kzalloc(sizeof(struct memcg_cache_array) + new_array_size * sizeof(void *), GFP_KERNEL); if (!new) @@ -203,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs) int ret = 0; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry(s, &slab_root_caches, root_caches_node) { ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches @@ -215,6 +221,28 @@ int memcg_update_all_caches(int num_memcgs) mutex_unlock(&slab_mutex); return ret; } + +void memcg_link_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_add(&s->root_caches_node, &slab_root_caches); + } else { + list_add(&s->memcg_params.children_node, + &s->memcg_params.root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, + &s->memcg_params.memcg->kmem_caches); + } +} + +static void memcg_unlink_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_del(&s->root_caches_node); + } else { + list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); + } +} #else static inline int init_memcg_params(struct kmem_cache *s, struct mem_cgroup *memcg, struct kmem_cache *root_cache) @@ -225,6 +253,10 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_unlink_cache(struct kmem_cache *s) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* @@ -255,7 +287,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, { struct kmem_cache *s; - if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + if (slab_nomerge) return NULL; if (ctor) @@ -266,7 +298,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, size = ALIGN(size, align); flags = kmem_cache_flags(size, flags, name, NULL); - list_for_each_entry_reverse(s, &slab_caches, list) { + if (flags & SLAB_NEVER_MERGE) + return NULL; + + list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) { if (slab_unmergeable(s)) continue; @@ -350,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); + memcg_link_cache(s); out: if (err) return ERR_PTR(err); @@ -404,6 +440,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this @@ -452,33 +494,61 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int shutdown_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { - if (__kmem_cache_shutdown(s) != 0) - return -EBUSY; + LIST_HEAD(to_destroy); + struct kmem_cache *s, *s2; - if (s->flags & SLAB_DESTROY_BY_RCU) - *need_rcu_barrier = true; + /* + * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * @slab_caches_to_rcu_destroy list. The slab pages are freed + * through RCU and and the associated kmem_cache are dereferenced + * while freeing the pages, so the kmem_caches should be freed only + * after the pending RCU operations are finished. As rcu_barrier() + * is a pretty slow operation, we batch all pending destructions + * asynchronously. + */ + mutex_lock(&slab_mutex); + list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); + mutex_unlock(&slab_mutex); - list_move(&s->list, release); - return 0; + if (list_empty(&to_destroy)) + return; + + rcu_barrier(); + + list_for_each_entry_safe(s, s2, &to_destroy, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_release(s); +#else + slab_kmem_cache_release(s); +#endif + } } -static void release_caches(struct list_head *release, bool need_rcu_barrier) +static int shutdown_cache(struct kmem_cache *s) { - struct kmem_cache *s, *s2; + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + + if (__kmem_cache_shutdown(s) != 0) + return -EBUSY; - if (need_rcu_barrier) - rcu_barrier(); + memcg_unlink_cache(s); + list_del(&s->list); - list_for_each_entry_safe(s, s2, release, list) { + if (s->flags & SLAB_DESTROY_BY_RCU) { + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + } else { #ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); + sysfs_slab_release(s); #else slab_kmem_cache_release(s); #endif } + + return 0; } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) @@ -545,8 +615,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.list, &root_cache->memcg_params.list); - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -562,6 +630,66 @@ out_unlock: put_online_cpus(); } +static void kmemcg_deactivate_workfn(struct work_struct *work) +{ + struct kmem_cache *s = container_of(work, struct kmem_cache, + memcg_params.deact_work); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + s->memcg_params.deact_fn(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + css_put(&s->memcg_params.memcg->css); +} + +static void kmemcg_deactivate_rcufn(struct rcu_head *head) +{ + struct kmem_cache *s = container_of(head, struct kmem_cache, + memcg_params.deact_rcu_head); + + /* + * We need to grab blocking locks. Bounce to ->deact_work. The + * work item shares the space with the RCU head and can't be + * initialized eariler. + */ + INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); +} + +/** + * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a + * sched RCU grace period + * @s: target kmem_cache + * @deact_fn: deactivation function to call + * + * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * held after a sched RCU grace period. The slab is guaranteed to stay + * alive until @deact_fn is finished. This is to be used from + * __kmemcg_cache_deactivate(). + */ +void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)) +{ + if (WARN_ON_ONCE(is_root_cache(s)) || + WARN_ON_ONCE(s->memcg_params.deact_fn)) + return; + + /* pin memcg so that @s doesn't get destroyed in the middle */ + css_get(&s->memcg_params.memcg->css); + + s->memcg_params.deact_fn = deact_fn; + call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); +} + void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) { int idx; @@ -574,17 +702,14 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - + list_for_each_entry(s, &slab_root_caches, root_caches_node) { arr = rcu_dereference_protected(s->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); c = arr->entries[idx]; if (!c) continue; - __kmem_cache_shrink(c, true); + __kmemcg_cache_deactivate(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -593,47 +718,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } -static int __shutdown_memcg_cache(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) -{ - BUG_ON(is_root_cache(s)); - - if (shutdown_cache(s, release, need_rcu_barrier)) - return -EBUSY; - - list_del(&s->memcg_params.list); - return 0; -} - void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - LIST_HEAD(release); - bool need_rcu_barrier = false; struct kmem_cache *s, *s2; get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params.memcg != memcg) - continue; + list_for_each_entry_safe(s, s2, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { /* * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); + BUG_ON(shutdown_cache(s)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } -static int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static int shutdown_memcg_caches(struct kmem_cache *s) { struct memcg_cache_array *arr; struct kmem_cache *c, *c2; @@ -652,13 +759,13 @@ static int shutdown_memcg_caches(struct kmem_cache *s, c = arr->entries[i]; if (!c) continue; - if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + if (shutdown_cache(c)) /* * The cache still has objects. Move it to a temporary * list so as not to try to destroy it for a second * time while iterating over inactive caches below. */ - list_move(&c->memcg_params.list, &busy); + list_move(&c->memcg_params.children_node, &busy); else /* * The cache is empty and will be destroyed soon. Clear @@ -673,23 +780,22 @@ static int shutdown_memcg_caches(struct kmem_cache *s, * Second, shutdown all caches left from memory cgroups that are now * offline. */ - list_for_each_entry_safe(c, c2, &s->memcg_params.list, - memcg_params.list) - __shutdown_memcg_cache(c, release, need_rcu_barrier); + list_for_each_entry_safe(c, c2, &s->memcg_params.children, + memcg_params.children_node) + shutdown_cache(c); - list_splice(&busy, &s->memcg_params.list); + list_splice(&busy, &s->memcg_params.children); /* * A cache being destroyed must be empty. In particular, this means * that all per memcg caches attached to it must be empty too. */ - if (!list_empty(&s->memcg_params.list)) + if (!list_empty(&s->memcg_params.children)) return -EBUSY; return 0; } #else -static inline int shutdown_memcg_caches(struct kmem_cache *s, - struct list_head *release, bool *need_rcu_barrier) +static inline int shutdown_memcg_caches(struct kmem_cache *s) { return 0; } @@ -705,8 +811,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - LIST_HEAD(release); - bool need_rcu_barrier = false; int err; if (unlikely(!s)) @@ -715,16 +819,15 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); - kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; if (s->refcount) goto out_unlock; - err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + err = shutdown_memcg_caches(s); if (!err) - err = shutdown_cache(s, &release, &need_rcu_barrier); + err = shutdown_cache(s); if (err) { pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", @@ -736,8 +839,6 @@ out_unlock: put_online_mems(); put_online_cpus(); - - release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -755,7 +856,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; @@ -799,6 +900,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, create_boot_cache(s, name, size, flags); list_add(&s->list, &slab_caches); + memcg_link_cache(s); s->refcount = 1; return s; } @@ -883,10 +985,7 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is * kmalloc-67108864. */ -static struct { - const char *name; - unsigned long size; -} const kmalloc_info[] __initconst = { +const struct kmalloc_info_struct kmalloc_info[] __initconst = { {NULL, 0}, {"kmalloc-96", 96}, {"kmalloc-192", 192}, {"kmalloc-8", 8}, {"kmalloc-16", 16}, {"kmalloc-32", 32}, @@ -1109,12 +1208,12 @@ static void print_slabinfo_header(struct seq_file *m) void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); + return seq_list_start(&slab_root_caches, *pos); } void *slab_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &slab_caches, pos); + return seq_list_next(p, &slab_root_caches, pos); } void slab_stop(struct seq_file *m, void *p) @@ -1166,25 +1265,44 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) static int slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node); - if (p == slab_caches.next) + if (p == slab_root_caches.next) print_slabinfo_header(m); - if (is_root_cache(s)) - cache_show(s, m); + cache_show(s, m); return 0; } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void *memcg_slab_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mutex_lock(&slab_mutex); + return seq_list_start(&memcg->kmem_caches, *pos); +} + +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + return seq_list_next(p, &memcg->kmem_caches, pos); +} + +void memcg_slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + int memcg_slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, + memcg_params.kmem_caches_node); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (p == slab_caches.next) + if (p == memcg->kmem_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params.memcg == memcg) - cache_show(s, m); + cache_show(s, m); return 0; } #endif diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 2b3e740609e9..7f4bc7027ed5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); +static void sysfs_slab_remove(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -496,10 +498,11 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -static void print_section(char *text, u8 *addr, unsigned int length) +static void print_section(char *level, char *text, u8 *addr, + unsigned int length) { metadata_access_enable(); - print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); metadata_access_disable(); } @@ -636,14 +639,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + s->red_left_pad); else if (p > addr + 16) - print_section("Bytes b4 ", p - 16, 16); + print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->object_size, - PAGE_SIZE)); + print_section(KERN_ERR, "Object ", p, + min_t(unsigned long, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); if (s->offset) @@ -658,7 +662,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, size_from_object(s) - off); + print_section(KERN_ERR, "Padding ", p + off, + size_from_object(s) - off); dump_stack(); } @@ -820,7 +825,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -973,7 +978,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object ", (void *)object, + print_section(KERN_INFO, "Object ", (void *)object, s->object_size); dump_stack(); @@ -1419,6 +1424,10 @@ static int init_cache_random_seq(struct kmem_cache *s) int err; unsigned long i, count = oo_objects(s->oo); + /* Bailout if already initialised */ + if (s->random_seq) + return 0; + err = cache_random_seq_create(s, count, GFP_KERNEL); if (err) { pr_err("SLUB: Unable to initialize free list for %s\n", @@ -1623,6 +1632,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= ~GFP_SLAB_BUG_MASK; pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); } return allocate_slab(s, @@ -3076,7 +3086,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (unlikely(!df.page)) + if (!df.page) continue; slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); @@ -3679,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + sysfs_slab_remove(s); return 0; } @@ -3883,7 +3894,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3895,21 +3906,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3960,13 +3956,49 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) return ret; } +#ifdef CONFIG_MEMCG +static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +{ + /* + * Called with all the locks held after a sched RCU grace period. + * Even if @s becomes empty after shrinking, we can't know that @s + * doesn't have allocations already in-flight and thus can't + * destroy @s until the associated memcg is released. + * + * However, let's remove the sysfs files for empty caches here. + * Each cache has a lot of interface files which aren't + * particularly useful for empty draining caches; otherwise, we can + * easily end up with millions of unnecessary sysfs files on + * systems which have a lot of memory and transient cgroups. + */ + if (!__kmem_cache_shrink(s)) + sysfs_slab_remove(s); +} + +void __kmemcg_cache_deactivate(struct kmem_cache *s) +{ + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), so + * we have to make sure the change is visible before shrinking. + */ + slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); +} +#endif + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; @@ -4116,6 +4148,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); + memcg_link_cache(s); return s; } @@ -4675,6 +4708,22 @@ enum slab_stat_type { #define SO_OBJECTS (1 << SL_OBJECTS) #define SO_TOTAL (1 << SL_TOTAL) +#ifdef CONFIG_MEMCG +static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + +static int __init setup_slub_memcg_sysfs(char *str) +{ + int v; + + if (get_option(&str, &v) > 0) + memcg_sysfs_enabled = v; + + return 1; +} + +__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); +#endif + static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { @@ -5578,8 +5627,14 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; + struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + if (!kset) { + kobject_init(&s->kobj, &slab_ktype); + return 0; + } + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5596,7 +5651,7 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = cache_kset(s); + s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) goto out; @@ -5606,7 +5661,7 @@ static int sysfs_slab_add(struct kmem_cache *s) goto out_del_kobj; #ifdef CONFIG_MEMCG - if (is_root_cache(s)) { + if (is_root_cache(s) && memcg_sysfs_enabled) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { err = -ENOMEM; @@ -5629,7 +5684,7 @@ out_del_kobj: goto out; } -void sysfs_slab_remove(struct kmem_cache *s) +static void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5638,12 +5693,26 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); - kobject_put(&s->kobj); +} + +void sysfs_slab_release(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_put(&s->kobj); } /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 574c67b663fe..a56c3989f773 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,9 +196,9 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } -pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) +pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) @@ -208,6 +208,18 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) return pud; } +p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + p4d_populate(&init_mm, p4d, p); + } + return p4d; +} + pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); @@ -225,6 +237,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start, { unsigned long addr = start; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -233,7 +246,10 @@ int __meminit vmemmap_populate_basepages(unsigned long start, pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; pmd = vmemmap_pmd_populate(pud, addr, node); diff --git a/mm/sparse.c b/mm/sparse.c index 1e168bf2779a..db6bf3c97ea2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -662,12 +662,12 @@ static void free_map_bootmem(struct page *memmap) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = (unsigned long) page->lru.next; + magic = (unsigned long) page->freelist; BUG_ON(magic == NODE_INFO); maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = page->private; + removing_section_nr = page_private(page); /* * When this function is called, the removing section is diff --git a/mm/swap.c b/mm/swap.c index 4dcf852e1e6d..5dabf444d724 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(zone_lru_lock(zone), flags); } + __ClearPageWaiters(page); mem_cgroup_uncharge(page); } @@ -208,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, { int *pgmoved = arg; - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &lruvec->lists[lru]); + if (PageLRU(page) && !PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec, page_lru(page)); + ClearPageActive(page); + add_page_to_lru_list_tail(page, lruvec, page_lru(page)); (*pgmoved)++; } } @@ -234,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec) */ void rotate_reclaimable_page(struct page *page) { - if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + if (!PageLocked(page) && !PageDirty(page) && !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; @@ -668,30 +670,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -/* - * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM - * workqueue, aiding in getting memory freed. - */ -static struct workqueue_struct *lru_add_drain_wq; - -static int __init lru_init(void) -{ - lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); - - if (WARN(!lru_add_drain_wq, - "Failed to create workqueue lru_add_drain_wq")) - return -ENOMEM; - - return 0; -} -early_initcall(lru_init); - void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON(!mm_percpu_wq)) + return; + mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); @@ -705,7 +696,7 @@ void lru_add_drain_all(void) pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, lru_add_drain_wq, work); + queue_work_on(cpu, mm_percpu_wq, work); cpumask_set_cpu(cpu, &has_work); } } @@ -784,6 +775,7 @@ void release_pages(struct page **pages, int nr, bool cold) /* Clear Active bit in case of parallel mark_page_accessed */ __ClearPageActive(page); + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); } @@ -969,12 +961,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); -#ifdef CONFIG_SWAP - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) - spin_lock_init(&swapper_spaces[i].tree_lock); -#endif /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 310ac0b8f974..ac6318a064d3 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -201,6 +201,8 @@ void swap_cgroup_swapoff(int type) struct page *page = map[i]; if (page) __free_page(page); + if (!(i % SWAP_CLUSTER_MAX)) + cond_resched(); } vfree(map); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c new file mode 100644 index 000000000000..b1ccb58ad397 --- /dev/null +++ b/mm/swap_slots.c @@ -0,0 +1,340 @@ +/* + * Manage cache of swap slots to be used for and returned from + * swap. + * + * Copyright(c) 2016 Intel Corporation. + * + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * We allocate the swap slots from the global pool and put + * it into local per cpu caches. This has the advantage + * of no needing to acquire the swap_info lock every time + * we need a new slot. + * + * There is also opportunity to simply return the slot + * to local caches without needing to acquire swap_info + * lock. We do not reuse the returned slots directly but + * move them back to the global pool in a batch. This + * allows the slots to coaellesce and reduce fragmentation. + * + * The swap entry allocated is marked with SWAP_HAS_CACHE + * flag in map_count that prevents it from being allocated + * again from the global pool. + * + * The swap slots cache is protected by a mutex instead of + * a spin lock as when we search for slots with scan_swap_map, + * we can possibly sleep. + */ + +#include <linux/swap_slots.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> + +#ifdef CONFIG_SWAP + +static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +static bool swap_slot_cache_active; +bool swap_slot_cache_enabled; +static bool swap_slot_cache_initialized; +DEFINE_MUTEX(swap_slots_cache_mutex); +/* Serialize swap slots cache enable/disable operations */ +DEFINE_MUTEX(swap_slots_cache_enable_mutex); + +static void __drain_swap_slots_cache(unsigned int type); +static void deactivate_swap_slots_cache(void); +static void reactivate_swap_slots_cache(void); + +#define use_swap_slot_cache (swap_slot_cache_active && \ + swap_slot_cache_enabled && swap_slot_cache_initialized) +#define SLOTS_CACHE 0x1 +#define SLOTS_CACHE_RET 0x2 + +static void deactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = false; + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + mutex_unlock(&swap_slots_cache_mutex); +} + +static void reactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = true; + mutex_unlock(&swap_slots_cache_mutex); +} + +/* Must not be called with cpu hot plug lock */ +void disable_swap_slots_cache_lock(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + swap_slot_cache_enabled = false; + if (swap_slot_cache_initialized) { + /* serialize with cpu hotplug operations */ + get_online_cpus(); + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + put_online_cpus(); + } +} + +static void __reenable_swap_slots_cache(void) +{ + swap_slot_cache_enabled = has_usable_swap(); +} + +void reenable_swap_slots_cache_unlock(void) +{ + __reenable_swap_slots_cache(); + mutex_unlock(&swap_slots_cache_enable_mutex); +} + +static bool check_cache_active(void) +{ + long pages; + + if (!swap_slot_cache_enabled || !swap_slot_cache_initialized) + return false; + + pages = get_nr_swap_pages(); + if (!swap_slot_cache_active) { + if (pages > num_online_cpus() * + THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) + reactivate_swap_slots_cache(); + goto out; + } + + /* if global pool of slot caches too low, deactivate cache */ + if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) + deactivate_swap_slots_cache(); +out: + return swap_slot_cache_active; +} + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots, *slots_ret; + + /* + * Do allocation outside swap_slots_cache_mutex + * as vzalloc could trigger reclaim and get_swap_page, + * which can lock swap_slots_cache_mutex. + */ + slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots) + return -ENOMEM; + + slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots_ret) { + vfree(slots); + return -ENOMEM; + } + + mutex_lock(&swap_slots_cache_mutex); + cache = &per_cpu(swp_slots, cpu); + if (cache->slots || cache->slots_ret) + /* cache already allocated */ + goto out; + if (!cache->lock_initialized) { + mutex_init(&cache->alloc_lock); + spin_lock_init(&cache->free_lock); + cache->lock_initialized = true; + } + cache->nr = 0; + cache->cur = 0; + cache->n_ret = 0; + cache->slots = slots; + slots = NULL; + cache->slots_ret = slots_ret; + slots_ret = NULL; +out: + mutex_unlock(&swap_slots_cache_mutex); + if (slots) + vfree(slots); + if (slots_ret) + vfree(slots_ret); + return 0; +} + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots = NULL; + + cache = &per_cpu(swp_slots, cpu); + if ((type & SLOTS_CACHE) && cache->slots) { + mutex_lock(&cache->alloc_lock); + swapcache_free_entries(cache->slots + cache->cur, cache->nr); + cache->cur = 0; + cache->nr = 0; + if (free_slots && cache->slots) { + vfree(cache->slots); + cache->slots = NULL; + } + mutex_unlock(&cache->alloc_lock); + } + if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + if (free_slots && cache->slots_ret) { + slots = cache->slots_ret; + cache->slots_ret = NULL; + } + spin_unlock_irq(&cache->free_lock); + if (slots) + vfree(slots); + } +} + +static void __drain_swap_slots_cache(unsigned int type) +{ + unsigned int cpu; + + /* + * This function is called during + * 1) swapoff, when we have to make sure no + * left over slots are in cache when we remove + * a swap device; + * 2) disabling of swap slot cache, when we run low + * on swap slots when allocating memory and need + * to return swap slots to global pool. + * + * We cannot acquire cpu hot plug lock here as + * this function can be invoked in the cpu + * hot plug path: + * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback + * -> memory allocation -> direct reclaim -> get_swap_page + * -> drain_swap_slots_cache + * + * Hence the loop over current online cpu below could miss cpu that + * is being brought online but not yet marked as online. + * That is okay as we do not schedule and run anything on a + * cpu before it has been marked online. Hence, we will not + * fill any swap slots in slots cache of such cpu. + * There are no slots on such cpu that need to be drained. + */ + for_each_online_cpu(cpu) + drain_slots_cache_cpu(cpu, type, false); +} + +static int free_slot_cache(unsigned int cpu) +{ + mutex_lock(&swap_slots_cache_mutex); + drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} + +int enable_swap_slots_cache(void) +{ + int ret = 0; + + mutex_lock(&swap_slots_cache_enable_mutex); + if (swap_slot_cache_initialized) { + __reenable_swap_slots_cache(); + goto out_unlock; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", + alloc_swap_slot_cache, free_slot_cache); + if (ret < 0) + goto out_unlock; + swap_slot_cache_initialized = true; + __reenable_swap_slots_cache(); +out_unlock: + mutex_unlock(&swap_slots_cache_enable_mutex); + return 0; +} + +/* called with swap slot cache's alloc lock held */ +static int refill_swap_slots_cache(struct swap_slots_cache *cache) +{ + if (!use_swap_slot_cache || cache->nr) + return 0; + + cache->cur = 0; + if (swap_slot_cache_active) + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + + return cache->nr; +} + +int free_swap_slot(swp_entry_t entry) +{ + struct swap_slots_cache *cache; + + cache = &get_cpu_var(swp_slots); + if (use_swap_slot_cache && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + /* Swap slots cache may be deactivated before acquiring lock */ + if (!use_swap_slot_cache) { + spin_unlock_irq(&cache->free_lock); + goto direct_free; + } + if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { + /* + * Return slots to global pool. + * The current swap_map value is SWAP_HAS_CACHE. + * Set it to 0 to indicate it is available for + * allocation in global pool + */ + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + } + cache->slots_ret[cache->n_ret++] = entry; + spin_unlock_irq(&cache->free_lock); + } else { +direct_free: + swapcache_free_entries(&entry, 1); + } + put_cpu_var(swp_slots); + + return 0; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry, *pentry; + struct swap_slots_cache *cache; + + /* + * Preemption is allowed here, because we may sleep + * in refill_swap_slots_cache(). But it is safe, because + * accesses to the per-CPU data structure are protected by the + * mutex cache->alloc_lock. + * + * The alloc path here does not touch cache->slots_ret + * so cache->free_lock is not taken. + */ + cache = raw_cpu_ptr(&swp_slots); + + entry.val = 0; + if (check_cache_active()) { + mutex_lock(&cache->alloc_lock); + if (cache->slots) { +repeat: + if (cache->nr) { + pentry = &cache->slots[cache->cur++]; + entry = *pentry; + pentry->val = 0; + cache->nr--; + } else { + if (refill_swap_slots_cache(cache)) + goto repeat; + } + } + mutex_unlock(&cache->alloc_lock); + if (entry.val) + return entry; + } + + get_swap_pages(1, &entry); + + return entry; +} + +#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 35d7e0ee1c77..473b71e052a8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,8 @@ #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> +#include <linux/vmalloc.h> +#include <linux/swap_slots.h> #include <asm/pgtable.h> @@ -32,15 +34,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swapper_spaces[MAX_SWAPFILES] = { - [0 ... MAX_SWAPFILES - 1] = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .i_mmap_writable = ATOMIC_INIT(0), - .a_ops = &swap_aops, - /* swap cache doesn't use writeback related tags */ - .flags = 1 << AS_NO_WRITEBACK_TAGS, - } -}; +struct address_space *swapper_spaces[MAX_SWAPFILES]; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,11 +48,26 @@ static struct { unsigned long total_swapcache_pages(void) { - int i; + unsigned int i, j, nr; unsigned long ret = 0; + struct address_space *spaces; - for (i = 0; i < MAX_SWAPFILES; i++) - ret += swapper_spaces[i].nrpages; + rcu_read_lock(); + for (i = 0; i < MAX_SWAPFILES; i++) { + /* + * The corresponding entries in nr_swapper_spaces and + * swapper_spaces will be reused only after at least + * one grace period. So it is impossible for them + * belongs to different usage. + */ + nr = nr_swapper_spaces[i]; + spaces = rcu_dereference(swapper_spaces[i]); + if (!nr || !spaces) + continue; + for (j = 0; j < nr; j++) + ret += spaces[j].nrpages; + } + rcu_read_unlock(); return ret; } @@ -315,6 +325,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, break; /* + * Just skip read ahead for unused swap slot. + * During swap_off when swap_slot_cache is disabled, + * we have to handle the race between putting + * swap entry in swap cache and marking swap slot + * as SWAP_HAS_CACHE. That's done in later part of code or + * else swap_off will be aborted if we return NULL. + */ + if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + break; + + /* * Get a new page to read into from swap. */ if (!new_page) { @@ -505,3 +526,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } + +int init_swap_address_space(unsigned int type, unsigned long nr_pages) +{ + struct address_space *spaces, *space; + unsigned int i, nr; + + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + spaces = vzalloc(sizeof(struct address_space) * nr); + if (!spaces) + return -ENOMEM; + for (i = 0; i < nr; i++) { + space = spaces + i; + INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + atomic_set(&space->i_mmap_writable, 0); + space->a_ops = &swap_aops; + /* swap cache doesn't use writeback related tags */ + mapping_set_no_writeback_tags(space); + spin_lock_init(&space->tree_lock); + } + nr_swapper_spaces[type] = nr; + rcu_assign_pointer(swapper_spaces[type], spaces); + + return 0; +} + +void exit_swap_address_space(unsigned int type) +{ + struct address_space *spaces; + + spaces = swapper_spaces[type]; + nr_swapper_spaces[type] = 0; + rcu_assign_pointer(swapper_spaces[type], NULL); + synchronize_rcu(); + kvfree(spaces); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index f30438970cd1..178130880b90 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -6,6 +6,8 @@ */ #include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/slab.h> @@ -34,6 +36,7 @@ #include <linux/frontswap.h> #include <linux/swapfile.h> #include <linux/export.h> +#include <linux/swap_slots.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -257,6 +260,47 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = si->cluster_info; + if (ci) { + ci += offset / SWAPFILE_CLUSTER; + spin_lock(&ci->lock); + } + return ci; +} + +static inline void unlock_cluster(struct swap_cluster_info *ci) +{ + if (ci) + spin_unlock(&ci->lock); +} + +static inline struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (!ci) + spin_lock(&si->lock); + + return ci; +} + +static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + if (ci) + unlock_cluster(ci); + else + spin_unlock(&si->lock); +} + static inline bool cluster_list_empty(struct swap_cluster_list *list) { return cluster_is_null(&list->head); @@ -281,9 +325,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { + struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); - cluster_set_next(&ci[tail], idx); + /* + * Nested cluster lock, but both cluster locks are + * only acquired when we held swap_info_struct->lock + */ + ci_tail = ci + tail; + spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); + cluster_set_next(ci_tail, idx); + unlock_cluster(ci_tail); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -328,7 +380,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info; + struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; @@ -341,10 +393,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) SWAPFILE_CLUSTER); spin_lock(&si->lock); - cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + cluster_set_flag(ci, CLUSTER_FLAG_FREE); + unlock_cluster(ci); cluster_list_add_tail(&si->free_clusters, info, idx); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); } } @@ -443,12 +499,13 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, * Try to get a swap entry from current cpu's swap entry pool (a cluster). This * might involve allocating a new cluster for current CPU too. */ -static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; + struct swap_cluster_info *ci; bool found_free; - unsigned long tmp; + unsigned long tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); @@ -466,7 +523,7 @@ new_cluster: *scan_base = *offset = si->cluster_next; goto new_cluster; } else - return; + return false; } found_free = false; @@ -476,14 +533,21 @@ new_cluster: * check if there is still free entry in the cluster */ tmp = cluster->next; - while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * - SWAPFILE_CLUSTER) { + max = min_t(unsigned long, si->max, + (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + if (tmp >= max) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + ci = lock_cluster(si, tmp); + while (tmp < max) { if (!si->swap_map[tmp]) { found_free = true; break; } tmp++; } + unlock_cluster(ci); if (!found_free) { cluster_set_null(&cluster->index); goto new_cluster; @@ -491,15 +555,22 @@ new_cluster: cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + return found_free; } -static unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[]) { + struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int n_ret = 0; + + if (nr > SWAP_BATCH) + nr = SWAP_BATCH; /* * We try to cluster swap pages by allocating them sequentially @@ -517,8 +588,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); - goto checks; + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto scan; } if (unlikely(!si->cluster_nr--)) { @@ -562,8 +635,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + /* take a break if we already got some slots */ + if (n_ret) + goto done; + if (!scan_swap_map_try_ssd_cluster(si, &offset, + &scan_base)) + goto scan; + } } if (!(si->flags & SWP_WRITEOK)) goto no_page; @@ -572,9 +651,11 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; + unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); spin_lock(&si->lock); @@ -584,8 +665,13 @@ checks: goto scan; /* check next one */ } - if (si->swap_map[offset]) - goto scan; + if (si->swap_map[offset]) { + unlock_cluster(ci); + if (!n_ret) + goto scan; + else + goto done; + } if (offset == si->lowest_bit) si->lowest_bit++; @@ -601,10 +687,45 @@ checks: } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; + slots[n_ret++] = swp_entry(si->type, offset); + + /* got enough slots or reach max slots? */ + if ((n_ret == nr) || (offset >= si->highest_bit)) + goto done; - return offset; + /* search for next available slot */ + + /* time to take a break? */ + if (unlikely(--latency_ration < 0)) { + if (n_ret) + goto done; + spin_unlock(&si->lock); + cond_resched(); + spin_lock(&si->lock); + latency_ration = LATENCY_LIMIT; + } + + /* try to get more slots in cluster */ + if (si->cluster_info) { + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto done; + } + /* non-ssd case */ + ++offset; + + /* non-ssd case, still more slots in cluster? */ + if (si->cluster_nr && !si->swap_map[offset]) { + --si->cluster_nr; + goto checks; + } + +done: + si->flags -= SWP_SCANNING; + return n_ret; scan: spin_unlock(&si->lock); @@ -642,17 +763,41 @@ scan: no_page: si->flags -= SWP_SCANNING; - return 0; + return n_ret; +} + +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) +{ + swp_entry_t entry; + int n_ret; + + n_ret = scan_swap_map_slots(si, usage, 1, &entry); + + if (n_ret) + return swp_offset(entry); + else + return 0; + } -swp_entry_t get_swap_page(void) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) { struct swap_info_struct *si, *next; - pgoff_t offset; + long avail_pgs; + int n_ret = 0; - if (atomic_long_read(&nr_swap_pages) <= 0) + avail_pgs = atomic_long_read(&nr_swap_pages); + if (avail_pgs <= 0) goto noswap; - atomic_long_dec(&nr_swap_pages); + + if (n_goal > SWAP_BATCH) + n_goal = SWAP_BATCH; + + if (n_goal > avail_pgs) + n_goal = avail_pgs; + + atomic_long_sub(n_goal, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -678,14 +823,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - - /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_HAS_CACHE); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (offset) - return swp_entry(si->type, offset); + if (n_ret) + goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); + si->type); + spin_lock(&swap_avail_lock); nextsi: /* @@ -696,7 +841,8 @@ nextsi: * up between us dropping swap_avail_lock and taking si->lock. * Since we dropped the swap_avail_lock, the swap_avail_head * list may have been modified; so if next is still in the - * swap_avail_head list then try it, otherwise start over. + * swap_avail_head list then try it, otherwise start over + * if we have not gotten any slots. */ if (plist_node_empty(&next->avail_list)) goto start_over; @@ -704,9 +850,11 @@ nextsi: spin_unlock(&swap_avail_lock); - atomic_long_inc(&nr_swap_pages); +check_out: + if (n_ret < n_goal) + atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); noswap: - return (swp_entry_t) {0}; + return n_ret; } /* The only caller of this function is now suspend routine */ @@ -731,7 +879,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) +static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -747,34 +895,76 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; - spin_lock(&p->lock); return p; -bad_free: - pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); - goto out; bad_offset: - pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); out: return NULL; } -static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { + struct swap_info_struct *p; + + p = __swap_info_get(entry); + if (!p) + goto out; + if (!p->swap_map[swp_offset(entry)]) + goto bad_free; + return p; + +bad_free: + pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + goto out; +out: + return NULL; +} + +static struct swap_info_struct *swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + if (p) + spin_lock(&p->lock); + return p; +} + +static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, + struct swap_info_struct *q) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + + if (p != q) { + if (q != NULL) + spin_unlock(&q->lock); + if (p != NULL) + spin_lock(&p->lock); + } + return p; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; + ci = lock_cluster_or_swap_info(p, offset); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -798,38 +988,52 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } usage = count | has_cache; - p->swap_map[offset] = usage; - - /* free if no reference */ - if (!usage) { - mem_cgroup_uncharge_swap(entry); - dec_cluster_info_page(p, p->cluster_info, offset); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); + p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + + unlock_cluster_or_swap_info(p, ci); + + return usage; +} + +static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char count; + + ci = lock_cluster(p, offset); + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; + dec_cluster_info_page(p, p->cluster_info, offset); + unlock_cluster(ci); + + mem_cgroup_uncharge_swap(entry); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; + + p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); } } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; - return usage; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } /* @@ -840,10 +1044,10 @@ void swap_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - swap_entry_free(p, entry, 1); - spin_unlock(&p->lock); + if (!__swap_entry_free(p, entry, 1)) + free_swap_slot(entry); } } @@ -854,13 +1058,35 @@ void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_HAS_CACHE); - spin_unlock(&p->lock); + if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); } } +void swapcache_free_entries(swp_entry_t *entries, int n) +{ + struct swap_info_struct *p, *prev; + int i; + + if (n <= 0) + return; + + prev = NULL; + p = NULL; + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) + swap_entry_free(p, entries[i]); + else + break; + prev = p; + } + if (p) + spin_unlock(&p->lock); +} + /* * How many references to page are currently swapped out? * This does not give an exact answer when swap count is continued, @@ -870,13 +1096,39 @@ int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; + struct swap_cluster_info *ci; swp_entry_t entry; + unsigned long offset; entry.val = page_private(page); - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&p->lock); + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(p, offset); + count = swap_count(p->swap_map[offset]); + unlock_cluster_or_swap_info(p, ci); + } + return count; +} + +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ +int __swp_swapcount(swp_entry_t entry) +{ + int count = 0; + pgoff_t offset; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + + si = __swap_info_get(entry); + if (si) { + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); } return count; } @@ -889,22 +1141,26 @@ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; + struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (!p) return 0; - count = swap_count(p->swap_map[swp_offset(entry)]); + offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + + count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - offset = swp_offset(entry); page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -919,7 +1175,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); return count; } @@ -943,11 +1199,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -997,21 +1267,23 @@ int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; struct page *page = NULL; + unsigned char count; if (non_swap_entry(entry)) return 1; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { + count = __swap_entry_free(p, entry, 1); + if (count == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; } - } - spin_unlock(&p->lock); + } else if (!count) + free_swap_slot(entry); } if (page) { /* @@ -1234,6 +1506,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { + cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -1244,7 +1517,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, return 0; } -static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { @@ -1252,7 +1525,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long next; int ret; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -1264,6 +1537,26 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, return 0; } +static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + p4d_t *p4d; + unsigned long next; + int ret; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + ret = unuse_pud_range(vma, p4d, addr, next, entry, page); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + return 0; +} + static int unuse_vma(struct vm_area_struct *vma, swp_entry_t entry, struct page *page) { @@ -1287,7 +1580,7 @@ static int unuse_vma(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); if (ret) return ret; } while (pgd++, addr = next, addr != end); @@ -1313,6 +1606,7 @@ static int unuse_mm(struct mm_struct *mm, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; + cond_resched(); } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; @@ -1350,15 +1644,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - if (frontswap) { - if (frontswap_test(si, i)) - break; - else - continue; - } count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - break; + if (!frontswap || frontswap_test(si, i)) + break; + if ((i % LATENCY_LIMIT) == 0) + cond_resched(); } return i; } @@ -1402,7 +1693,7 @@ int try_to_unuse(unsigned int type, bool frontswap, * that. */ start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); + mmget(&init_mm); /* * Keep on scanning until all entries have gone. Usually, @@ -1451,7 +1742,7 @@ int try_to_unuse(unsigned int type, bool frontswap, if (atomic_read(&start_mm->mm_users) == 1) { mmput(start_mm); start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); + mmget(&init_mm); } /* @@ -1488,13 +1779,13 @@ int try_to_unuse(unsigned int type, bool frontswap, struct mm_struct *prev_mm = start_mm; struct mm_struct *mm; - atomic_inc(&new_start_mm->mm_users); - atomic_inc(&prev_mm->mm_users); + mmget(new_start_mm); + mmget(prev_mm); spin_lock(&mmlist_lock); while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); - if (!atomic_inc_not_zero(&mm->mm_users)) + if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); @@ -1512,7 +1803,7 @@ int try_to_unuse(unsigned int type, bool frontswap, if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); - atomic_inc(&mm->mm_users); + mmget(mm); new_start_mm = mm; set_start_mm = 0; } @@ -1840,6 +2131,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +bool has_usable_swap(void) +{ + bool ret = true; + + spin_lock(&swap_lock); + if (plist_head_empty(&swap_active_head)) + ret = false; + spin_unlock(&swap_lock); + return ret; +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -1910,6 +2212,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&p->lock); spin_unlock(&swap_lock); + disable_swap_slots_cache_lock(); + set_current_oom_origin(); err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); @@ -1917,9 +2221,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); + reenable_swap_slots_cache_unlock(); goto out_dput; } + reenable_swap_slots_cache_unlock(); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -1962,6 +2269,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); + exit_swap_address_space(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2285,6 +2593,13 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } +#define SWAP_CLUSTER_INFO_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) + static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, @@ -2292,11 +2607,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, unsigned long maxpages, sector_t *span) { - int i; + unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ @@ -2344,15 +2660,23 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - for (i = 0; i < nr_clusters; i++) { - if (!cluster_count(&cluster_info[idx])) { + + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. + */ + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + idx = i * SWAP_CLUSTER_COLS + j; + if (idx >= nr_clusters) + continue; + if (cluster_count(&cluster_info[idx])) + continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } - idx++; - if (idx == nr_clusters) - idx = 0; } return nr_extents; } @@ -2449,8 +2773,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; + unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; /* @@ -2458,13 +2787,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * SSD */ p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = vzalloc(DIV_ROUND_UP(maxpages, - SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); if (!cluster_info) { error = -ENOMEM; goto bad_swap; } + + for (ci = 0; ci < nr_cluster; ci++) + spin_lock_init(&((cluster_info + ci)->lock)); + p->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!p->percpu_cluster) { error = -ENOMEM; @@ -2521,6 +2854,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + error = init_swap_address_space(p->type, maxpages); + if (error) + goto bad_swap; + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) @@ -2576,6 +2913,8 @@ out: putname(name); if (inode && S_ISREG(inode->i_mode)) inode_unlock(inode); + if (!error) + enable_swap_slots_cache(); return error; } @@ -2610,6 +2949,7 @@ void si_swapinfo(struct sysinfo *val) static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; + struct swap_cluster_info *ci; unsigned long offset, type; unsigned char count; unsigned char has_cache; @@ -2623,10 +2963,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto bad_file; p = swap_info[type]; offset = swp_offset(entry); - - spin_lock(&p->lock); if (unlikely(offset >= p->max)) - goto unlock_out; + goto out; + + ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -2669,7 +3009,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); out: return err; @@ -2758,6 +3098,7 @@ EXPORT_SYMBOL_GPL(__page_file_index); int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; + struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; @@ -2781,6 +3122,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } offset = swp_offset(entry); + + ci = lock_cluster(si, offset); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { @@ -2793,6 +3137,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { + unlock_cluster(ci); spin_unlock(&si->lock); return -ENOMEM; } @@ -2841,6 +3186,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: + unlock_cluster(ci); spin_unlock(&si->lock); outer: if (page) @@ -2854,7 +3200,8 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster + * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..6263affdef88 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,56 +20,84 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include <linux/shmem_fs.h> #include <linux/cleancache.h> #include <linux/rmap.h> #include "internal.h" -static void clear_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) { struct radix_tree_node *node; void **slot; - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) - return; - - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); - return; - } spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } +/* + * Unconditionally remove exceptional entry. Usually called from truncate path. + */ +static void truncate_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + if (dax_mapping(mapping)) { + dax_delete_mapping_entry(mapping, index); + return; + } + clear_shadow_entry(mapping, index, entry); +} + +/* + * Invalidate exceptional entry if easily possible. This handles exceptional + * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and + * clean entries. + */ +static int invalidate_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return 1; + if (dax_mapping(mapping)) + return dax_invalidate_mapping_entry(mapping, index); + clear_shadow_entry(mapping, index, entry); + return 1; +} + +/* + * Invalidate exceptional entry if clean. This handles exceptional entries for + * invalidate_inode_pages2() so for DAX it evicts only clean entries. + */ +static int invalidate_exceptional_entry2(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return 1; + if (dax_mapping(mapping)) + return dax_invalidate_mapping_entry_sync(mapping, index); + clear_shadow_entry(mapping, index, entry); + return 1; +} + /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected @@ -277,7 +305,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + truncate_exceptional_entry(mapping, index, + page); continue; } @@ -366,7 +395,8 @@ void truncate_inode_pages_range(struct address_space *mapping, } if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + truncate_exceptional_entry(mapping, index, + page); continue; } @@ -485,7 +515,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + invalidate_exceptional_entry(mapping, index, + page); continue; } @@ -607,7 +638,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + if (!invalidate_exceptional_entry2(mapping, + index, page)) + ret = -EBUSY; continue; } @@ -753,7 +786,7 @@ EXPORT_SYMBOL(truncate_setsize); */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); loff_t rounded_from; struct page *page; pgoff_t index; diff --git a/mm/usercopy.c b/mm/usercopy.c index 3c8da0af9695..d155e12563b1 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -16,6 +16,9 @@ #include <linux/mm.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> #include <asm/sections.h> enum { @@ -108,13 +111,13 @@ static inline const char *check_kernel_text_object(const void *ptr, * __pa() is not just the reverse of __va(). This can be detected * and checked: */ - textlow_linear = (unsigned long)__va(__pa(textlow)); + textlow_linear = (unsigned long)lm_alias(textlow); /* No different mapping: we're done. */ if (textlow_linear == textlow) return NULL; /* Check the secondary mapping... */ - texthigh_linear = (unsigned long)__va(__pa(texthigh)); + texthigh_linear = (unsigned long)lm_alias(texthigh); if (overlaps(ptr, n, textlow_linear, texthigh_linear)) return "<linear kernel text>"; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af817e5060fb..8bcb501bce60 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -8,12 +8,16 @@ */ #include <linux/mm.h> +#include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include "internal.h" @@ -124,20 +128,248 @@ out_unlock: static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (pud) + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is + * called with mmap_sem held, it will release mmap_sem before returning. + */ +static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; + int vm_shared = dst_vma->vm_flags & VM_SHARED; + ssize_t err; + pte_t *dst_pte; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + struct hstate *h; + unsigned long vma_hpagesize; + pgoff_t idx; + u32 hash; + struct address_space *mapping; + + /* + * There is no default zero huge page for all huge page sizes as + * supported by hugetlb. A PMD_SIZE huge pages may exist as used + * by THP. Since we can not reliably insert a zero page, this + * feature is not supported. + */ + if (zeropage) { + up_read(&dst_mm->mmap_sem); + return -EINVAL; + } + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; + vma_hpagesize = vma_kernel_pagesize(dst_vma); + + /* + * Validate alignment based on huge page size + */ + err = -EINVAL; + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + goto out_unlock; + +retry: + /* + * On routine entry dst_vma is set. If we had to drop mmap_sem and + * retry, dst_vma will be set to NULL and we must lookup again. + */ + if (!dst_vma) { + err = -ENOENT; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) + goto out_unlock; + /* + * Only allow __mcopy_atomic_hugetlb on userfaultfd + * registered ranges. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + err = -EINVAL; + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock; + + vm_shared = dst_vma->vm_flags & VM_SHARED; + } + + if (WARN_ON(dst_addr & (vma_hpagesize - 1) || + (len - copied) & (vma_hpagesize - 1))) + goto out_unlock; + + /* + * If not shared, ensure the dst_vma has a anon_vma. + */ + err = -ENOMEM; + if (!vm_shared) { + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + } + + h = hstate_vma(dst_vma); + + while (src_addr < src_start + len) { + pte_t dst_pteval; + + BUG_ON(dst_addr >= dst_start + len); + VM_BUG_ON(dst_addr & ~huge_page_mask(h)); + + /* + * Serialize via hugetlb_fault_mutex + */ + idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; + hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, + idx, dst_addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + err = -ENOMEM; + dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); + if (!dst_pte) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = -EEXIST; + dst_pteval = huge_ptep_get(dst_pte); + if (!huge_pte_none(dst_pteval)) { + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + dst_addr, src_addr, &page); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + vm_alloc_shared = vm_shared; + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + err = copy_huge_page_from_user(page, + (const void __user *)src_addr, + pages_per_huge_page(h), true); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + down_read(&dst_mm->mmap_sem); + + dst_vma = NULL; + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += vma_hpagesize; + src_addr += vma_hpagesize; + copied += vma_hpagesize; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) { /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. + * We encountered an error and are about to free a newly + * allocated huge page. + * + * Reservation handling is very subtle, and is different for + * private and shared mappings. See the routine + * restore_reserve_on_error for details. Unfortunately, we + * can not call restore_reserve_on_error now as it would + * require holding mmap_sem. + * + * If a reservation for the page existed in the reservation + * map of a private mapping, the map was modified to indicate + * the reservation was consumed when the page was allocated. + * We clear the PagePrivate flag now so that the global + * reserve count will not be incremented in free_huge_page. + * The reservation map will still indicate the reservation + * was consumed and possibly prevent later page allocation. + * This is better than leaking a global reservation. If no + * reservation existed, it is still safe to clear PagePrivate + * as no adjustments to reservation counts were made during + * allocation. + * + * The reservation map for shared mappings indicates which + * pages have reservations. When a huge page is allocated + * for an address with a reservation, no change is made to + * the reserve map. In this case PagePrivate will be set + * to indicate that the global reservation count should be + * incremented when the page is freed. This is the desired + * behavior. However, when a huge page is allocated for an + * address without a reservation a reservation entry is added + * to the reservation map, and PagePrivate will not be set. + * When the page is freed, the global reserve count will NOT + * be incremented and it will appear as though we have leaked + * reserved page. In this case, set PagePrivate so that the + * global reserve count will be incremented to match the + * reservation map entry which was created. + * + * Note that vm_alloc_shared is based on the flags of the vma + * for which the page was originally allocated. dst_vma could + * be different or NULL on error. */ - pmd = pmd_alloc(mm, pud, address); - return pmd; + if (vm_alloc_shared) + SetPagePrivate(page); + else + ClearPagePrivate(page); + put_page(page); + } + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? copied : err; } +#else /* !CONFIG_HUGETLB_PAGE */ +/* fail at build time if gcc attempts to use this */ +extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage); +#endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, @@ -173,14 +405,10 @@ retry: * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ - err = -EINVAL; + err = -ENOENT; dst_vma = find_vma(dst_mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out_unlock; - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) + if (!dst_vma) goto out_unlock; - /* * Be strict and only allow __mcopy_atomic on userfaultfd * registered ranges to prevent userland errors going @@ -193,11 +421,27 @@ retry: if (!dst_vma->vm_userfaultfd_ctx.ctx) goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + err = -EINVAL; + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + /* - * FIXME: only allow copying on anonymous vmas, tmpfs should - * be added. + * If this is a HUGETLB vma, pass off to appropriate routine */ - if (dst_vma->vm_ops) + if (is_vm_hugetlb_page(dst_vma)) + return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, + src_start, len, zeropage); + + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; /* @@ -206,7 +450,7 @@ retry: * dst_vma. */ err = -ENOMEM; - if (unlikely(anon_vma_prepare(dst_vma))) + if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma))) goto out_unlock; while (src_addr < src_start + len) { @@ -243,12 +487,21 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, - dst_addr); + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, + &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + err = -EINVAL; /* if zeropage is true return -EINVAL */ + if (likely(!zeropage)) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, &page); + } cond_resched(); diff --git a/mm/util.c b/mm/util.c index 1a41553db866..656dc5e37a87 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,15 +5,18 @@ #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> +#include <linux/userfaultfd_k.h> #include <asm/sections.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -297,14 +300,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); + &populate, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } diff --git a/mm/vmacache.c b/mm/vmacache.c index 035fdeb35b43..7ffa0ee341b5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -1,7 +1,8 @@ /* * Copyright (C) 2014 Davidlohr Bueso. */ -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -60,7 +61,7 @@ static inline bool vmacache_valid_mm(struct mm_struct *mm) void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) { if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache[VMACACHE_HASH(addr)] = newvma; + current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; } static bool vmacache_valid(struct mm_struct *mm) @@ -71,12 +72,12 @@ static bool vmacache_valid(struct mm_struct *mm) return false; curr = current; - if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + if (mm->vmacache_seqnum != curr->vmacache.seqnum) { /* * First attempt will always be invalid, initialize * the new cache for this task here. */ - curr->vmacache_seqnum = mm->vmacache_seqnum; + curr->vmacache.seqnum = mm->vmacache_seqnum; vmacache_flush(curr); return false; } @@ -93,7 +94,7 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache[i]; + struct vm_area_struct *vma = current->vmacache.vmas[i]; if (!vma) continue; @@ -121,7 +122,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache[i]; + struct vm_area_struct *vma = current->vmacache.vmas[i]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b2..0b057628a7ba 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,7 +12,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -32,7 +32,7 @@ #include <linux/llist.h> #include <linux/bitops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -86,12 +86,12 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_clear_huge(pud)) @@ -102,6 +102,22 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) } while (pud++, addr = next, addr != end); } +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_clear_huge(p4d)) + continue; + if (p4d_none_or_clear_bad(p4d)) + continue; + vunmap_pud_range(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; @@ -113,7 +129,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end) next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_pud_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -160,13 +176,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return 0; } -static int vmap_pud_range(pgd_t *pgd, unsigned long addr, +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, pgd, addr); + pud = pud_alloc(&init_mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -177,6 +193,23 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, return 0; } +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + return 0; +} + /* * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and * will have pfns corresponding to the "pages" array. @@ -196,7 +229,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -237,6 +270,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for @@ -244,21 +281,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); - if (!pgd_none(*pgd)) { - pud_t *pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd_t *pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - pte_t *ptep, pte; - - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } + if (pgd_none(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); return page; } EXPORT_SYMBOL(vmalloc_to_page); @@ -365,7 +404,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + might_sleep(); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -601,6 +640,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual criticial section protected + * by this look, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -615,59 +661,40 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; - - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. - */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); + bool do_free = false; - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; + do_free = true; } - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!do_free) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); } - spin_unlock(&purge_lock); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -676,9 +703,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (mutex_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + } } /* @@ -686,9 +714,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -711,22 +740,13 @@ static void free_vmap_area_noflush(struct vmap_area *va) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. - */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); -} - -/* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) @@ -740,16 +760,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1070,6 +1080,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1094,7 +1106,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -1107,7 +1123,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1116,10 +1134,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1455,6 +1477,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; @@ -1510,7 +1534,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * nother cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. + */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1533,11 +1589,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1627,6 +1681,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; + if (fatal_signal_pending(current)) { + area->nr_pages = i; + goto fail_no_warn; + } + if (node == NUMA_NO_NODE) page = alloc_page(alloc_mask); else @@ -1647,9 +1706,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); +fail_no_warn: vfree(area->addr); return NULL; } @@ -1709,7 +1769,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } @@ -2294,7 +2354,7 @@ EXPORT_SYMBOL_GPL(free_vm_area); #ifdef CONFIG_SMP static struct vmap_area *node_to_va(struct rb_node *n) { - return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; + return rb_entry_safe(n, struct vmap_area, rb_node); } /** @@ -2574,32 +2634,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_area_lock) { - loff_t n = *pos; - struct vmap_area *va; - spin_lock(&vmap_area_lock); - va = list_first_entry(&vmap_area_list, typeof(*va), list); - while (n > 0 && &va->list != &vmap_area_list) { - n--; - va = list_next_entry(va, list); - } - if (!n && &va->list != &vmap_area_list) - return va; - - return NULL; - + return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vmap_area *va = p, *next; - - ++*pos; - next = list_next_entry(va, list); - if (&next->list != &vmap_area_list) - return next; - - return NULL; + return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) @@ -2634,9 +2675,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vmap_area *va = p; + struct vmap_area *va; struct vm_struct *v; + va = list_entry(p, struct vmap_area, list); + /* * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. @@ -2656,7 +2699,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); + seq_printf(m, " phys=%pa", &v->phys_addr); if (v->flags & VM_IOREMAP) seq_puts(m, " ioremap"); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 149fdf6c5c56..6063581f705c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -112,9 +112,16 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; - unsigned long pressure; + unsigned long pressure = 0; /* + * reclaimed can be greater than scanned in cases + * like THP, where the scanned is 1 and reclaimed + * could be 512 + */ + if (reclaimed >= scanned) + goto out; + /* * We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages @@ -124,6 +131,7 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale; +out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); diff --git a/mm/vmscan.c b/mm/vmscan.c index d75cdf360730..bc8031ef994d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> @@ -87,6 +88,7 @@ struct scan_control { /* The highest zone to isolate pages for reclaim from */ enum zone_type reclaim_idx; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -234,12 +236,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) pgdat_reclaimable_pages(pgdat) * 6; } -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. + * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + */ +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { + unsigned long lru_size; + int zid; + if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + lru_size = mem_cgroup_get_lru_size(lruvec, lru); + else + lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); + + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + unsigned long size; + + if (!managed_zone(zone)) + continue; + + if (!mem_cgroup_disabled()) + size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + else + size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], + NR_ZONE_LRU_BASE + lru); + lru_size -= min(size, lru_size); + } + + return lru_size; - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } /* @@ -291,6 +320,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -312,7 +342,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -369,17 +401,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); @@ -894,6 +931,17 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +struct reclaim_stat { + unsigned nr_dirty; + unsigned nr_unqueued_dirty; + unsigned nr_congested; + unsigned nr_writeback; + unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; +}; + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -901,22 +949,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_unqueued_dirty, - unsigned long *ret_nr_congested, - unsigned long *ret_nr_writeback, - unsigned long *ret_nr_immediate, + struct reclaim_stat *stat, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + unsigned nr_unqueued_dirty = 0; + unsigned nr_dirty = 0; + unsigned nr_congested = 0; + unsigned nr_reclaimed = 0; + unsigned nr_writeback = 0; + unsigned nr_immediate = 0; + unsigned nr_ref_keep = 0; + unsigned nr_unmap_fail = 0; cond_resched(); @@ -1011,6 +1057,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the pages to get them out of + * the way while we continue scanning for clean pages on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. + * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. */ if (PageWriteback(page)) { /* Case 1 above */ @@ -1018,7 +1073,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; - goto keep_locked; + goto activate_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || @@ -1036,7 +1091,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; + goto activate_locked; /* Case 3 above */ } else { @@ -1055,6 +1110,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: + nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1092,6 +1148,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: + nr_unmap_fail++; goto activate_locked; case SWAP_AGAIN: goto keep_locked; @@ -1106,13 +1163,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but only writeback - * if many dirty pages have been encountered. + * Only kswapd can writeback filesystem pages + * to avoid risk of stack overflow. But avoid + * injecting inefficient single-page IO into + * flusher writeback as much as possible: only + * write pages when we've encountered many + * dirty pages, and when we've already scanned + * the rest of the LRU for clean pages and see + * the same dirty pages again (PageReclaim). */ if (page_is_file_cache(page) && - (!current_is_kswapd() || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + (!current_is_kswapd() || !PageReclaim(page) || + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1122,7 +1184,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); - goto keep_locked; + goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) @@ -1258,11 +1320,16 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - *ret_nr_dirty += nr_dirty; - *ret_nr_congested += nr_congested; - *ret_nr_unqueued_dirty += nr_unqueued_dirty; - *ret_nr_writeback += nr_writeback; - *ret_nr_immediate += nr_immediate; + if (stat) { + stat->nr_dirty = nr_dirty; + stat->nr_congested = nr_congested; + stat->nr_unqueued_dirty = nr_unqueued_dirty; + stat->nr_writeback = nr_writeback; + stat->nr_immediate = nr_immediate; + stat->nr_activate = pgactivate; + stat->nr_ref_keep = nr_ref_keep; + stat->nr_unmap_fail = nr_unmap_fail; + } return nr_reclaimed; } @@ -1274,7 +1341,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + unsigned long ret; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1287,8 +1354,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1323,13 +1389,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * wants to isolate pages it will be able to operate on without * blocking - clean pages for the most part. * - * ISOLATE_CLEAN means that only clean pages should be isolated. This - * is used by reclaim when it is cannot write to backing storage - * * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages * that it is possible to migrate without blocking */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) return ret; @@ -1337,10 +1400,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate @@ -1374,8 +1433,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1384,11 +1442,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); - } - #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); #endif + } + } /* @@ -1420,6 +1478,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0, total_skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); @@ -1471,14 +1530,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ if (!list_empty(&pages_skipped)) { int zid; - unsigned long total_skipped = 0; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); - total_skipped += nr_skipped[zid]; + skipped += nr_skipped[zid]; } /* @@ -1486,14 +1544,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * close to unreclaimable. If the LRU list is empty, account * skipped pages as a full scan. */ - scan += list_empty(src) ? total_skipped : total_skipped >> 2; + total_skipped = list_empty(src) ? skipped : skipped >> 2; list_splice(&pages_skipped, src); } - *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, - nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + *nr_scanned = scan + total_skipped; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + scan, skipped, nr_taken, mode, lru); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -1652,30 +1710,6 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } -static bool inactive_reclaimable_pages(struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) -{ - int zid; - struct zone *zone; - int file = is_file_lru(lru); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - if (!global_reclaim(sc)) - return true; - - for (zid = sc->reclaim_idx; zid >= 0; zid--) { - zone = &pgdat->node_zones[zid]; - if (!managed_zone(zone)) - continue; - - if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + - LRU_FILE * file) >= SWAP_CLUSTER_MAX) - return true; - } - - return false; -} - /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages @@ -1688,19 +1722,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; + struct reclaim_stat stat = {}; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - if (!inactive_reclaimable_pages(lruvec, sc, lru)) - return 0; - while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1713,8 +1740,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1737,9 +1762,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, - &nr_dirty, &nr_unqueued_dirty, &nr_congested, - &nr_writeback, &nr_immediate, - false); + &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1773,7 +1796,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback == nr_taken) + if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* @@ -1785,17 +1808,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. */ - if (nr_dirty && nr_dirty == nr_congested) + if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); /* * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the pgdat PGDAT_DIRTY and kswapd will start writing pages from - * reclaim context. + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep, but + * also allow kswapd to start writing pages during reclaim. */ - if (nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(0, WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); + } /* * If kswapd scans pages marked marked for immediate @@ -1803,7 +1834,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_immediate && current_may_throttle()) + if (stat.nr_immediate && current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1818,6 +1849,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, + stat.nr_dirty, stat.nr_writeback, + stat.nr_congested, stat.nr_immediate, + stat.nr_activate, stat.nr_ref_keep, + stat.nr_unmap_fail, sc->priority, file); return nr_reclaimed; } @@ -1838,17 +1873,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lru. */ -static void move_active_pages_to_lru(struct lruvec *lruvec, +static unsigned move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long pgmoved = 0; struct page *page; int nr_pages; + int nr_moved = 0; while (!list_empty(list)) { page = lru_to_page(list); @@ -1860,7 +1897,6 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, nr_pages = hpage_nr_pages(page); update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += nr_pages; if (put_page_testzero(page)) { __ClearPageLRU(page); @@ -1874,11 +1910,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); + } else { + nr_moved += nr_pages; } } if (!is_active_lru(lru)) - __count_vm_events(PGDEACTIVATE, pgmoved); + __count_vm_events(PGDEACTIVATE, nr_moved); + + return nr_moved; } static void shrink_active_list(unsigned long nr_to_scan, @@ -1894,7 +1934,8 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - unsigned long nr_rotated = 0; + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -1903,8 +1944,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1972,13 +2011,15 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); } /* @@ -2008,14 +2049,13 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc) + struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; + unsigned long inactive, active; + enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; unsigned long gb; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - int zid; /* * If we don't have swap space, anonymous page deactivation @@ -2024,29 +2064,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); - - /* - * For zone-constrained allocations, it is necessary to check if - * deactivations are required for lowmem to be reclaimed. This - * calculates the inactive/active pages available in eligible zones. - */ - for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - unsigned long inactive_zone, active_zone; - - if (!managed_zone(zone)) - continue; - - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); - - inactive -= min(inactive, inactive_zone); - active -= min(active, active_zone); - } + inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); + active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -2054,6 +2073,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, else inactive_ratio = 1; + if (trace) + trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, + sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); + return inactive * inactive_ratio < active; } @@ -2061,7 +2087,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2192,8 +2218,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + if (!inactive_list_is_low(lruvec, true, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2219,10 +2245,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2260,7 +2286,7 @@ out: unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); scan = size >> sc->priority; if (!scan && pass && force_scan) @@ -2417,7 +2443,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2746,8 +2772,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; - unsigned long total_scanned = 0; - unsigned long writeback_threshold; retry: delayacct_freepages_start(); @@ -2760,7 +2784,6 @@ retry: sc->nr_scanned = 0; shrink_zones(zonelist, sc); - total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; @@ -2773,20 +2796,6 @@ retry: */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; - - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; - if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); - sc->may_writepage = 1; - } } while (--sc->priority >= 0); delayacct_freepages_end(); @@ -3067,7 +3076,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3088,6 +3097,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); return true; } @@ -3558,24 +3568,21 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int kswapd_cpu_online(unsigned int cpu) { int nid; - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; - mask = cpumask_of_node(pgdat->node_id); + mask = cpumask_of_node(pgdat->node_id); - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kswapd, mask); - } + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kswapd, mask); } - return NOTIFY_OK; + return 0; } /* @@ -3617,12 +3624,15 @@ void kswapd_stop(int nid) static int __init kswapd_init(void) { - int nid; + int nid, ret; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - hotcpu_notifier(cpu_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/vmscan:online", kswapd_cpu_online, + NULL); + WARN_ON(ret < 0); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 604f26a4f696..5a4f5c5a31e8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = { "compact_fail", "compact_success", "compact_daemon_wake", + "compact_daemon_migrate_scanned", + "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE @@ -1063,6 +1065,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + "thp_split_pud", +#endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif @@ -1547,7 +1552,6 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP -static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; @@ -1618,7 +1622,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread. */ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1697,7 +1701,7 @@ static void vmstat_shepherd(struct work_struct *w) struct delayed_work *dw = &per_cpu(vmstat_work, cpu); if (!delayed_work_pending(dw) && need_update(cpu)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); @@ -1713,82 +1717,76 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } static void __init init_cpu_node_state(void) { - int cpu; + int node; - get_online_cpus(); - for_each_online_cpu(cpu) - node_set_state(cpu_to_node(cpu), N_CPU); - put_online_cpus(); + for_each_online_node(node) { + if (cpumask_weight(cpumask_of_node(node)) > 0) + node_set_state(node, N_CPU); + } } -static void vmstat_cpu_dead(int node) +static int vmstat_cpu_online(unsigned int cpu) { - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (cpu_to_node(cpu) == node) - goto end; + refresh_zone_stat_thresholds(); + node_set_state(cpu_to_node(cpu), N_CPU); + return 0; +} - node_clear_state(node, N_CPU); -end: - put_online_cpus(); +static int vmstat_cpu_down_prep(unsigned int cpu) +{ + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + return 0; } -/* - * Use the cpu notifier to insure that the thresholds are recalculated - * when necessary. - */ -static int vmstat_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - refresh_zone_stat_thresholds(); - node_set_state(cpu_to_node(cpu), N_CPU); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - refresh_zone_stat_thresholds(); - vmstat_cpu_dead(cpu_to_node(cpu)); - break; - default: - break; - } - return NOTIFY_OK; +static int vmstat_cpu_dead(unsigned int cpu) +{ + const struct cpumask *node_cpus; + int node; + + node = cpu_to_node(cpu); + + refresh_zone_stat_thresholds(); + node_cpus = cpumask_of_node(node); + if (cpumask_weight(node_cpus) > 0) + return 0; + + node_clear_state(node, N_CPU); + return 0; } -static struct notifier_block vmstat_notifier = - { &vmstat_cpuup_callback, NULL, 0 }; #endif -static int __init setup_vmstat(void) +struct workqueue_struct *mm_percpu_wq; + +void __init init_mm_internals(void) { + int ret __maybe_unused; + + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); + #ifdef CONFIG_SMP - cpu_notifier_register_begin(); - __register_cpu_notifier(&vmstat_notifier); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", + NULL, vmstat_cpu_dead); + if (ret < 0) + pr_err("vmstat: failed to register 'dead' hotplug state\n"); + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online", + vmstat_cpu_online, + vmstat_cpu_down_prep); + if (ret < 0) + pr_err("vmstat: failed to register 'online' hotplug state\n"); + + get_online_cpus(); init_cpu_node_state(); + put_online_cpus(); start_shepherd_timer(); - cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); @@ -1796,9 +1794,7 @@ static int __init setup_vmstat(void) proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif - return 0; } -module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) diff --git a/mm/workingset.c b/mm/workingset.c index fb1f9183d89a..eda05c71fa49 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -6,10 +6,12 @@ #include <linux/memcontrol.h> #include <linux/writeback.h> +#include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> @@ -266,7 +268,7 @@ bool workingset_refault(void *shadow) } lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); rcu_read_unlock(); /* @@ -334,48 +336,79 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) + list_lru_add(&shadow_nodes, &node->private_list); + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; - unsigned long pages; + unsigned long nodes; + unsigned long cache; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); - if (sc->memcg) { - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - /* - * Active cache pages are limited to 50% of memory, and shadow - * entries that represent a refault distance bigger than that - * do not have any effect. Limit the number of shadow nodes - * such that shadow entries do not exceed the number of active - * cache pages, assuming a worst-case node population density - * of 1/8th on average. + * Approximate a reasonable limit for the radix tree nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. * * On 64-bit with 7 radix_tree_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume - * ~2% of available memory: + * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ - max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + if (sc->memcg) { + cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + } else { + cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } + max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -401,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, */ node = container_of(item, struct radix_tree_node, private_list); - mapping = node->private_data; + mapping = container_of(node->root, struct address_space, page_tree); /* Coming from the list, invert the lock order */ if (!spin_trylock(&mapping->tree_lock)) { @@ -418,23 +451,31 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - BUG_ON(!workingset_node_shadows(node)); - BUG_ON(workingset_node_pages(node)); - + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(node->count != node->exceptional)) + goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; + if (WARN_ON_ONCE(!mapping->nrexceptional)) + goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); - BUG_ON(!mapping->nrexceptional); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - BUG_ON(workingset_node_shadows(node)); + if (WARN_ON_ONCE(node->exceptional)) + goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - if (!__radix_tree_delete_node(&mapping->page_tree, node)) - BUG(); + __radix_tree_delete_node(&mapping->page_tree, node, + workingset_update_node, mapping); +out_invalid: spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: @@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -492,7 +532,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -500,7 +540,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } diff --git a/mm/z3fold.c b/mm/z3fold.c index 8f9e89ca1d31..54f63c4a809a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,28 +34,61 @@ /***************** * Structures *****************/ +struct z3fold_pool; +struct z3fold_ops { + int (*evict)(struct z3fold_pool *pool, unsigned long handle); +}; + +enum buddy { + HEADLESS = 0, + FIRST, + MIDDLE, + LAST, + BUDDIES_MAX +}; + +/* + * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * z3fold page, except for HEADLESS pages + * @buddy: links the z3fold page into the relevant list in the pool + * @page_lock: per-page lock + * @refcount: reference cound for the z3fold page + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @middle_chunks: the size of the middle buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + * @first_num: the starting number (for the first handle) + */ +struct z3fold_header { + struct list_head buddy; + spinlock_t page_lock; + struct kref refcount; + unsigned short first_chunks; + unsigned short middle_chunks; + unsigned short last_chunks; + unsigned short start_middle; + unsigned short first_num:2; +}; + /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively * adjusting internal fragmentation. It also determines the number of * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk - * in allocated page is occupied by z3fold header, NCHUNKS will be calculated - * to 63 which shows the max number of free chunks in z3fold page, also there - * will be 63 freelists per pool. + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which shows the max number of free chunks in z3fold page, also there will + * be 63, or 62, respectively, freelists per pool. */ #define NCHUNKS_ORDER 6 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) -#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1) - -struct z3fold_pool; -struct z3fold_ops { - int (*evict)(struct z3fold_pool *pool, unsigned long handle); -}; +#define BUDDY_MASK (0x3) /** * struct z3fold_pool - stores metadata for each z3fold pool @@ -64,8 +97,6 @@ struct z3fold_ops { * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; * the lists each z3fold page is added to depends on the size of * its free region. - * @buddied: list tracking the z3fold pages that contain 3 buddies; - * these z3fold pages are full * @lru: list tracking the z3fold pages in LRU order by most recently * added buddy. * @pages_nr: number of z3fold pages in the pool. @@ -78,49 +109,22 @@ struct z3fold_ops { struct z3fold_pool { spinlock_t lock; struct list_head unbuddied[NCHUNKS]; - struct list_head buddied; struct list_head lru; - u64 pages_nr; + atomic64_t pages_nr; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; }; -enum buddy { - HEADLESS = 0, - FIRST, - MIDDLE, - LAST, - BUDDIES_MAX -}; - -/* - * struct z3fold_header - z3fold page metadata occupying the first chunk of each - * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the pool - * @first_chunks: the size of the first buddy in chunks, 0 if free - * @middle_chunks: the size of the middle buddy in chunks, 0 if free - * @last_chunks: the size of the last buddy in chunks, 0 if free - * @first_num: the starting number (for the first handle) - */ -struct z3fold_header { - struct list_head buddy; - unsigned short first_chunks; - unsigned short middle_chunks; - unsigned short last_chunks; - unsigned short start_middle; - unsigned short first_num:NCHUNKS_ORDER; -}; - /* * Internal z3fold page flags */ enum z3fold_page_flags { - UNDER_RECLAIM = 0, - PAGE_HEADLESS, + PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, }; + /***************** * Helpers *****************/ @@ -140,10 +144,11 @@ static struct z3fold_header *init_z3fold_page(struct page *page) struct z3fold_header *zhdr = page_address(page); INIT_LIST_HEAD(&page->lru); - clear_bit(UNDER_RECLAIM, &page->private); clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + spin_lock_init(&zhdr->page_lock); + kref_init(&zhdr->refcount); zhdr->first_chunks = 0; zhdr->middle_chunks = 0; zhdr->last_chunks = 0; @@ -154,9 +159,42 @@ static struct z3fold_header *init_z3fold_page(struct page *page) } /* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct z3fold_header *zhdr) +static void free_z3fold_page(struct page *page) { - __free_page(virt_to_page(zhdr)); + __free_page(page); +} + +static void release_z3fold_page(struct kref *ref) +{ + struct z3fold_header *zhdr; + struct page *page; + + zhdr = container_of(ref, struct z3fold_header, refcount); + page = virt_to_page(zhdr); + + if (!list_empty(&zhdr->buddy)) + list_del(&zhdr->buddy); + if (!list_empty(&page->lru)) + list_del(&page->lru); + free_z3fold_page(page); +} + +/* Lock a z3fold page */ +static inline void z3fold_page_lock(struct z3fold_header *zhdr) +{ + spin_lock(&zhdr->page_lock); +} + +/* Try to lock a z3fold page */ +static inline int z3fold_page_trylock(struct z3fold_header *zhdr) +{ + return spin_trylock(&zhdr->page_lock); +} + +/* Unlock a z3fold page */ +static inline void z3fold_page_unlock(struct z3fold_header *zhdr) +{ + spin_unlock(&zhdr->page_lock); } /* @@ -179,7 +217,11 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) return (struct z3fold_header *)(handle & PAGE_MASK); } -/* Returns buddy number */ +/* + * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle + * but that doesn't matter. because the masking will result in the + * correct buddy number. + */ static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr = handle_to_z3fold_header(handle); @@ -200,9 +242,10 @@ static int num_free_chunks(struct z3fold_header *zhdr) */ if (zhdr->middle_chunks != 0) { int nfree_before = zhdr->first_chunks ? - 0 : zhdr->start_middle - 1; + 0 : zhdr->start_middle - ZHDR_CHUNKS; int nfree_after = zhdr->last_chunks ? - 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks; + 0 : TOTAL_CHUNKS - + (zhdr->start_middle + zhdr->middle_chunks); nfree = max(nfree_before, nfree_after); } else nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; @@ -232,9 +275,8 @@ static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, spin_lock_init(&pool->lock); for_each_unbuddied_list(i, 0) INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->buddied); INIT_LIST_HEAD(&pool->lru); - pool->pages_nr = 0; + atomic64_set(&pool->pages_nr, 0); pool->ops = ops; return pool; } @@ -250,25 +292,58 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) kfree(pool); } +static inline void *mchunk_memmove(struct z3fold_header *zhdr, + unsigned short dst_chunk) +{ + void *beg = zhdr; + return memmove(beg + (dst_chunk << CHUNK_SHIFT), + beg + (zhdr->start_middle << CHUNK_SHIFT), + zhdr->middle_chunks << CHUNK_SHIFT); +} + +#define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) { struct page *page = virt_to_page(zhdr); - void *beg = zhdr; + if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) + return 0; /* can't move middle chunk, it's used */ + + if (zhdr->middle_chunks == 0) + return 0; /* nothing to compact */ - if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) && - zhdr->middle_chunks != 0 && - zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { - memmove(beg + ZHDR_SIZE_ALIGNED, - beg + (zhdr->start_middle << CHUNK_SHIFT), - zhdr->middle_chunks << CHUNK_SHIFT); + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* move to the beginning */ + mchunk_memmove(zhdr, ZHDR_CHUNKS); zhdr->first_chunks = zhdr->middle_chunks; zhdr->middle_chunks = 0; zhdr->start_middle = 0; zhdr->first_num++; return 1; } + + /* + * moving data is expensive, so let's only do that if + * there's substantial gain (at least BIG_CHUNK_GAP chunks) + */ + if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && + zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= + BIG_CHUNK_GAP) { + mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); + zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; + return 1; + } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && + TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + + zhdr->middle_chunks) >= + BIG_CHUNK_GAP) { + unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - + zhdr->middle_chunks; + mchunk_memmove(zhdr, new_start); + zhdr->start_middle = new_start; + return 1; + } + return 0; } @@ -309,50 +384,62 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, bud = HEADLESS; else { chunks = size_to_chunks(size); - spin_lock(&pool->lock); /* First, try to find an unbuddied z3fold page. */ zhdr = NULL; for_each_unbuddied_list(i, chunks) { - if (!list_empty(&pool->unbuddied[i])) { - zhdr = list_first_entry(&pool->unbuddied[i], + spin_lock(&pool->lock); + zhdr = list_first_entry_or_null(&pool->unbuddied[i], struct z3fold_header, buddy); - page = virt_to_page(zhdr); - if (zhdr->first_chunks == 0) { - if (zhdr->middle_chunks != 0 && - chunks >= zhdr->start_middle) - bud = LAST; - else - bud = FIRST; - } else if (zhdr->last_chunks == 0) + if (!zhdr || !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + continue; + } + kref_get(&zhdr->refcount); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (zhdr->first_chunks == 0) { + if (zhdr->middle_chunks != 0 && + chunks >= zhdr->start_middle) bud = LAST; - else if (zhdr->middle_chunks == 0) - bud = MIDDLE; - else { - pr_err("No free chunks in unbuddied\n"); - WARN_ON(1); - continue; - } - list_del(&zhdr->buddy); - goto found; + else + bud = FIRST; + } else if (zhdr->last_chunks == 0) + bud = LAST; + else if (zhdr->middle_chunks == 0) + bud = MIDDLE; + else { + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); + pr_err("No free chunks in unbuddied\n"); + WARN_ON(1); + continue; } + goto found; } bud = FIRST; - spin_unlock(&pool->lock); } /* Couldn't find unbuddied z3fold page, create new one */ page = alloc_page(gfp); if (!page) return -ENOMEM; - spin_lock(&pool->lock); - pool->pages_nr++; + + atomic64_inc(&pool->pages_nr); zhdr = init_z3fold_page(page); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); + spin_lock(&pool->lock); goto headless; } + z3fold_page_lock(zhdr); found: if (bud == FIRST) @@ -361,17 +448,15 @@ found: zhdr->last_chunks = chunks; else { zhdr->middle_chunks = chunks; - zhdr->start_middle = zhdr->first_chunks + 1; + zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } + spin_lock(&pool->lock); if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { /* Add to unbuddied list */ freechunks = num_free_chunks(zhdr); list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - } else { - /* Add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); } headless: @@ -383,6 +468,8 @@ headless: *handle = encode_handle(zhdr, bud); spin_unlock(&pool->lock); + if (bud != HEADLESS) + z3fold_page_unlock(zhdr); return 0; } @@ -404,7 +491,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) struct page *page; enum buddy bud; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); @@ -412,6 +498,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) /* HEADLESS page stored */ bud = HEADLESS; } else { + z3fold_page_lock(zhdr); bud = handle_to_buddy(handle); switch (bud) { @@ -428,38 +515,36 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) default: pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); - spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); return; } } - if (test_bit(UNDER_RECLAIM, &page->private)) { - /* z3fold page is under reclaim, reclaim will free */ - spin_unlock(&pool->lock); - return; - } - - if (bud != HEADLESS) { - /* Remove from existing buddy list */ - list_del(&zhdr->buddy); - } - - if (bud == HEADLESS || - (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 && - zhdr->last_chunks == 0)) { - /* z3fold page is empty, free */ + if (bud == HEADLESS) { + spin_lock(&pool->lock); list_del(&page->lru); - clear_bit(PAGE_HEADLESS, &page->private); - free_z3fold_page(zhdr); - pool->pages_nr--; + spin_unlock(&pool->lock); + free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); } else { - z3fold_compact_page(zhdr); - /* Add to the unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || + zhdr->last_chunks != 0) { + z3fold_compact_page(zhdr); + /* Add to the unbuddied list */ + spin_lock(&pool->lock); + if (!list_empty(&zhdr->buddy)) + list_del(&zhdr->buddy); + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_unlock(&pool->lock); + } + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); } - spin_unlock(&pool->lock); } /** @@ -506,20 +591,25 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || - retries == 0) { + if (!pool->ops || !pool->ops->evict || retries == 0) { spin_unlock(&pool->lock); return -EINVAL; } for (i = 0; i < retries; i++) { + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } page = list_last_entry(&pool->lru, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); - /* Protect z3fold page against free */ - set_bit(UNDER_RECLAIM, &page->private); zhdr = page_address(page); if (!test_bit(PAGE_HEADLESS, &page->private)) { - list_del(&zhdr->buddy); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + kref_get(&zhdr->refcount); + spin_unlock(&pool->lock); + z3fold_page_lock(zhdr); /* * We need encode the handles before unlocking, since * we can race with free that will set @@ -534,13 +624,13 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); + z3fold_page_unlock(zhdr); } else { first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; + spin_unlock(&pool->lock); } - spin_unlock(&pool->lock); - /* Issue the eviction callback(s) */ if (middle_handle) { ret = pool->ops->evict(pool, middle_handle); @@ -558,36 +648,41 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: - spin_lock(&pool->lock); - clear_bit(UNDER_RECLAIM, &page->private); - if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) || - (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 && - zhdr->middle_chunks == 0)) { - /* - * All buddies are now free, free the z3fold page and - * return success. - */ - clear_bit(PAGE_HEADLESS, &page->private); - free_z3fold_page(zhdr); - pool->pages_nr--; - spin_unlock(&pool->lock); - return 0; - } else if (!test_bit(PAGE_HEADLESS, &page->private)) { - if (zhdr->first_chunks != 0 && - zhdr->last_chunks != 0 && - zhdr->middle_chunks != 0) { - /* Full, add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); + if (test_bit(PAGE_HEADLESS, &page->private)) { + if (ret == 0) { + free_z3fold_page(page); + return 0; } else { + spin_lock(&pool->lock); + } + } else { + z3fold_page_lock(zhdr); + if ((zhdr->first_chunks || zhdr->last_chunks || + zhdr->middle_chunks) && + !(zhdr->first_chunks && zhdr->last_chunks && + zhdr->middle_chunks)) { z3fold_compact_page(zhdr); /* add to unbuddied list */ + spin_lock(&pool->lock); freechunks = num_free_chunks(zhdr); list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_unlock(&pool->lock); + } + z3fold_page_unlock(zhdr); + spin_lock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page)) { + spin_unlock(&pool->lock); + atomic64_dec(&pool->pages_nr); + return 0; } } - /* add to beginning of LRU */ + /* + * Add to the beginning of LRU. + * Pool lock has to be kept here to ensure the page has + * not already been released + */ list_add(&page->lru, &pool->lru); } spin_unlock(&pool->lock); @@ -611,7 +706,6 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) void *addr; enum buddy buddy; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); @@ -619,6 +713,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) if (test_bit(PAGE_HEADLESS, &page->private)) goto out; + z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: @@ -637,8 +732,9 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) addr = NULL; break; } + + z3fold_page_unlock(zhdr); out: - spin_unlock(&pool->lock); return addr; } @@ -653,31 +749,28 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) struct page *page; enum buddy buddy; - spin_lock(&pool->lock); zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); - if (test_bit(PAGE_HEADLESS, &page->private)) { - spin_unlock(&pool->lock); + if (test_bit(PAGE_HEADLESS, &page->private)) return; - } + z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); - spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); } /** * z3fold_get_pool_size() - gets the z3fold pool size in pages * @pool: pool whose size is being queried * - * Returns: size in pages of the given pool. The pool lock need not be - * taken to access pages_nr. + * Returns: size in pages of the given pool. */ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) { - return pool->pages_nr; + return atomic64_read(&pool->pages_nr); } /***************** @@ -776,8 +869,8 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - /* Make sure the z3fold header will fit in one chunk */ - BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED); + /* Make sure the z3fold header is not larger than the page size */ + BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); zpool_register_driver(&z3fold_zpool_driver); return 0; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b0bc023d25c5..d41edd28298b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -24,8 +24,7 @@ * * Usage of struct page flags: * PG_private: identifies the first component page - * PG_private2: identifies the last component page - * PG_owner_priv_1: indentifies the huge component page + * PG_owner_priv_1: identifies the huge component page * */ @@ -34,6 +33,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/magic.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> @@ -268,10 +268,6 @@ struct zs_pool { #endif }; -/* - * A zspage's class index and fullness group - * are encoded in its (first)page->mapping - */ #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @@ -280,7 +276,7 @@ struct zs_pool { struct zspage { struct { unsigned int fullness:FULLNESS_BITS; - unsigned int class:CLASS_BITS; + unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; @@ -364,7 +360,7 @@ static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { return kmem_cache_alloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); -}; +} static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) { @@ -938,7 +934,6 @@ static void reset_page(struct page *page) { __ClearPageMovable(page); ClearPagePrivate(page); - ClearPagePrivate2(page); set_page_private(page, 0); page_mapcount_reset(page); ClearPageHugeObject(page); @@ -1085,7 +1080,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page - * has this flag set) and PG_private_2 to identify the last page. + * has this flag set). */ for (i = 0; i < nr_pages; i++) { page = pages[i]; @@ -1100,8 +1095,6 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, } else { prev_page->freelist = page; } - if (i == nr_pages - 1) - SetPagePrivate2(page); prev_page = page; } } @@ -1284,61 +1277,21 @@ out: #endif /* CONFIG_PGTABLE_MAPPING */ -static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, - void *pcpu) +static int zs_cpu_prepare(unsigned int cpu) { - int ret, cpu = (long)pcpu; struct mapping_area *area; - switch (action) { - case CPU_UP_PREPARE: - area = &per_cpu(zs_map_area, cpu); - ret = __zs_cpu_up(area); - if (ret) - return notifier_from_errno(ret); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - break; - } - - return NOTIFY_OK; + area = &per_cpu(zs_map_area, cpu); + return __zs_cpu_up(area); } -static struct notifier_block zs_cpu_nb = { - .notifier_call = zs_cpu_notifier -}; - -static int zs_register_cpu_notifier(void) +static int zs_cpu_dead(unsigned int cpu) { - int cpu, uninitialized_var(ret); - - cpu_notifier_register_begin(); - - __register_cpu_notifier(&zs_cpu_nb); - for_each_online_cpu(cpu) { - ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) - break; - } - - cpu_notifier_register_done(); - return notifier_to_errno(ret); -} - -static void zs_unregister_cpu_notifier(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - __unregister_cpu_notifier(&zs_cpu_nb); + struct mapping_area *area; - cpu_notifier_register_done(); + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + return 0; } static void __init init_zs_size_classes(void) @@ -2423,7 +2376,7 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Iterate reversly, because, size of size_class that we want to use + * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. */ for (i = zs_size_classes - 1; i >= 0; i--) { @@ -2534,10 +2487,10 @@ static int __init zs_init(void) if (ret) goto out; - ret = zs_register_cpu_notifier(); - + ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", + zs_cpu_prepare, zs_cpu_dead); if (ret) - goto notifier_fail; + goto hp_setup_fail; init_zs_size_classes(); @@ -2549,8 +2502,7 @@ static int __init zs_init(void) return 0; -notifier_fail: - zs_unregister_cpu_notifier(); +hp_setup_fail: zsmalloc_unmount(); out: return ret; @@ -2562,7 +2514,7 @@ static void __exit zs_exit(void) zpool_unregister_driver(&zs_zpool_driver); #endif zsmalloc_unmount(); - zs_unregister_cpu_notifier(); + cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); } diff --git a/mm/zswap.c b/mm/zswap.c index 275b22cc8df4..eedc27894b10 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -76,9 +76,17 @@ static u64 zswap_duplicate_entry; * tunables **********************************/ +#define ZSWAP_PARAM_UNSET "" + /* Enable/disable zswap (disabled by default) */ static bool zswap_enabled; -module_param_named(enabled, zswap_enabled, bool, 0644); +static int zswap_enabled_param_set(const char *, + const struct kernel_param *); +static struct kernel_param_ops zswap_enabled_param_ops = { + .set = zswap_enabled_param_set, + .get = param_get_bool, +}; +module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" @@ -118,7 +126,7 @@ struct zswap_pool { struct kref kref; struct list_head list; struct work_struct work; - struct notifier_block notifier; + struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -176,6 +184,12 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; +/* fatal error during init */ +static bool zswap_init_failed; + +/* init completed, but couldn't create the initial pool */ +static bool zswap_has_pool; + /********************************* * helpers and fwd declarations **********************************/ @@ -352,143 +366,58 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, **********************************/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); -static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu) +static int zswap_dstmem_prepare(unsigned int cpu) { u8 *dst; - switch (action) { - case CPU_UP_PREPARE: - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); - return NOTIFY_BAD; - } - per_cpu(zswap_dstmem, cpu) = dst; - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - dst = per_cpu(zswap_dstmem, cpu); - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - break; - default: - break; + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + return -ENOMEM; } - return NOTIFY_OK; + per_cpu(zswap_dstmem, cpu) = dst; + return 0; } -static int zswap_cpu_dstmem_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) +static int zswap_dstmem_dead(unsigned int cpu) { - return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu); -} + u8 *dst; -static struct notifier_block zswap_dstmem_notifier = { - .notifier_call = zswap_cpu_dstmem_notifier, -}; + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; -static int __init zswap_cpu_dstmem_init(void) -{ - unsigned long cpu; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) == - NOTIFY_BAD) - goto cleanup; - __register_cpu_notifier(&zswap_dstmem_notifier); - cpu_notifier_register_done(); return 0; - -cleanup: - for_each_online_cpu(cpu) - __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); - cpu_notifier_register_done(); - return -ENOMEM; -} - -static void zswap_cpu_dstmem_destroy(void) -{ - unsigned long cpu; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); - __unregister_cpu_notifier(&zswap_dstmem_notifier); - cpu_notifier_register_done(); } -static int __zswap_cpu_comp_notifier(struct zswap_pool *pool, - unsigned long action, unsigned long cpu) +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_comp *tfm; - switch (action) { - case CPU_UP_PREPARE: - if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) - break; - tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - pr_err("could not alloc crypto comp %s : %ld\n", - pool->tfm_name, PTR_ERR(tfm)); - return NOTIFY_BAD; - } - *per_cpu_ptr(pool->tfm, cpu) = tfm; - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - tfm = *per_cpu_ptr(pool->tfm, cpu); - if (!IS_ERR_OR_NULL(tfm)) - crypto_free_comp(tfm); - *per_cpu_ptr(pool->tfm, cpu) = NULL; - break; - default: - break; - } - return NOTIFY_OK; -} - -static int zswap_cpu_comp_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - unsigned long cpu = (unsigned long)pcpu; - struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier); - - return __zswap_cpu_comp_notifier(pool, action, cpu); -} + if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) + return 0; -static int zswap_cpu_comp_init(struct zswap_pool *pool) -{ - unsigned long cpu; - - memset(&pool->notifier, 0, sizeof(pool->notifier)); - pool->notifier.notifier_call = zswap_cpu_comp_notifier; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) == - NOTIFY_BAD) - goto cleanup; - __register_cpu_notifier(&pool->notifier); - cpu_notifier_register_done(); + tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); + if (IS_ERR_OR_NULL(tfm)) { + pr_err("could not alloc crypto comp %s : %ld\n", + pool->tfm_name, PTR_ERR(tfm)); + return -ENOMEM; + } + *per_cpu_ptr(pool->tfm, cpu) = tfm; return 0; - -cleanup: - for_each_online_cpu(cpu) - __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); - cpu_notifier_register_done(); - return -ENOMEM; } -static void zswap_cpu_comp_destroy(struct zswap_pool *pool) +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { - unsigned long cpu; + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_comp *tfm; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); - __unregister_cpu_notifier(&pool->notifier); - cpu_notifier_register_done(); + tfm = *per_cpu_ptr(pool->tfm, cpu); + if (!IS_ERR_OR_NULL(tfm)) + crypto_free_comp(tfm); + *per_cpu_ptr(pool->tfm, cpu) = NULL; + return 0; } /********************************* @@ -500,7 +429,8 @@ static struct zswap_pool *__zswap_pool_current(void) struct zswap_pool *pool; pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ON(!pool); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); return pool; } @@ -519,7 +449,7 @@ static struct zswap_pool *zswap_pool_current_get(void) rcu_read_lock(); pool = __zswap_pool_current(); - if (!pool || !zswap_pool_get(pool)) + if (!zswap_pool_get(pool)) pool = NULL; rcu_read_unlock(); @@ -535,7 +465,9 @@ static struct zswap_pool *zswap_pool_last_get(void) list_for_each_entry_rcu(pool, &zswap_pools, list) last = pool; - if (!WARN_ON(!last) && !zswap_pool_get(last)) + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) last = NULL; rcu_read_unlock(); @@ -569,6 +501,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -593,7 +537,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) goto error; } - if (zswap_cpu_comp_init(pool)) + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) goto error; pr_debug("using %s compressor\n", pool->tfm_name); @@ -617,29 +563,41 @@ error: static __init struct zswap_pool *__zswap_pool_create_fallback(void) { - if (!crypto_has_comp(zswap_compressor, 0, 0)) { - if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("default compressor %s not available\n", - zswap_compressor); - return NULL; - } + bool has_comp, has_zpool; + + has_comp = crypto_has_comp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_comp(zswap_compressor, 0, 0); } - if (!zpool_has_pool(zswap_zpool_type)) { - if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - return NULL; - } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; } + if (!has_comp || !has_zpool) + return NULL; + return zswap_pool_create(zswap_zpool_type, zswap_compressor); } @@ -647,7 +605,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) { zswap_pool_debug("destroying", pool); - zswap_cpu_comp_destroy(pool); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->tfm); zpool_destroy_pool(pool->zpool); kfree(pool); @@ -655,6 +613,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool) static int __must_check zswap_pool_get(struct zswap_pool *pool) { + if (!pool) + return 0; + return kref_get_unless_zero(&pool->kref); } @@ -706,8 +667,13 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *s = strstrip((char *)val); int ret; + if (zswap_init_failed) { + pr_err("can't set param, initialization failed\n"); + return -ENODEV; + } + /* no change required */ - if (!strcmp(s, *(char **)kp->arg)) + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) return 0; /* if this is load-time (pre-init) param setting, @@ -738,21 +704,26 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, pool = zswap_pool_find_get(type, compressor); if (pool) { zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); - } else { - spin_unlock(&zswap_pools_lock); - pool = zswap_pool_create(type, compressor); - spin_lock(&zswap_pools_lock); } + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; + spin_lock(&zswap_pools_lock); + if (!ret) { put_pool = zswap_pool_current(); list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; } else if (pool) { /* add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and @@ -764,6 +735,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, spin_unlock(&zswap_pools_lock); + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_comp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + /* drop the ref from either the old current pool, * or the new pool we failed to add */ @@ -785,6 +767,21 @@ static int zswap_zpool_param_set(const char *val, return __zswap_param_set(val, kp, NULL, zswap_compressor); } +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + if (zswap_init_failed) { + pr_err("can't enable, initialization failed\n"); + return -ENODEV; + } + if (!zswap_has_pool && zswap_init_started) { + pr_err("can't enable, no pool configured\n"); + return -ENODEV; + } + + return param_set_bool(val, kp); +} + /********************************* * writeback code **********************************/ @@ -1238,6 +1235,7 @@ static void __exit zswap_debugfs_exit(void) { } static int __init init_zswap(void) { struct zswap_pool *pool; + int ret; zswap_init_started = true; @@ -1246,31 +1244,44 @@ static int __init init_zswap(void) goto cache_fail; } - if (zswap_cpu_dstmem_init()) { + ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", + zswap_dstmem_prepare, zswap_dstmem_dead); + if (ret) { pr_err("dstmem alloc failed\n"); goto dstmem_fail; } + ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, + "mm/zswap_pool:prepare", + zswap_cpu_comp_prepare, + zswap_cpu_comp_dead); + if (ret) + goto hp_fail; + pool = __zswap_pool_create_fallback(); - if (!pool) { + if (pool) { + pr_info("loaded using pool %s/%s\n", pool->tfm_name, + zpool_get_type(pool->zpool)); + list_add(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else { pr_err("pool creation failed\n"); - goto pool_fail; + zswap_enabled = false; } - pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpool)); - - list_add(&pool->list, &zswap_pools); frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; -pool_fail: - zswap_cpu_dstmem_destroy(); +hp_fail: + cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); dstmem_fail: zswap_entry_cache_destroy(); cache_fail: + /* if built-in, we aren't unloaded on failure; don't allow use */ + zswap_init_failed = true; + zswap_enabled = false; return -ENOMEM; } /* must be late so crypto has time to come up */ |