diff options
Diffstat (limited to 'mm')
85 files changed, 4199 insertions, 2314 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 24c045b24b95..02d44e3420f5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,7 +9,6 @@ config SELECT_MEMORY_MODEL choice prompt "Memory model" depends on SELECT_MEMORY_MODEL - default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL help @@ -149,6 +148,9 @@ config MEMORY_ISOLATION config HAVE_BOOTMEM_INFO_NODE def_bool n +config ARCH_ENABLE_MEMORY_HOTPLUG + bool + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -177,12 +179,20 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE Say N here if you want the default policy to keep all hot-plugged memory blocks in 'offline' state. +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. @@ -274,6 +284,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config HUGETLB_PAGE_SIZE_VARIABLE + def_bool n + help + Allows the pageblock_order value to be dynamic instead of just standard + HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available + on a platform. + config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA @@ -283,12 +300,11 @@ config PHYS_ADDR_T_64BIT config BOUNCE bool "Enable bounce buffers" default y - depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + depends on BLOCK && MMU && HIGHMEM help - Enable bounce buffers for devices that cannot access - the full range of memory available to the CPU. Enabled - by default when ZONE_DMA or HIGHMEM is selected, but you - may say n to override this. + Enable bounce buffers for devices that cannot access the full range of + memory available to the CPU. Enabled by default when HIGHMEM is + selected, but you may say n to override this. config VIRT_TO_BUS bool @@ -513,6 +529,13 @@ config CMA_DEBUGFS help Turns on the DebugFS interface for CMA. +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA @@ -760,6 +783,9 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +config ARCH_HAS_CACHE_LINE_SIZE + bool + config ARCH_HAS_PTE_DEVMAP bool @@ -872,4 +898,7 @@ config MAPPING_DIRTY_HELPERS config KMAP_LOCAL bool +# struct io_mapping based helper. Selected by drivers that need them +config IO_MAPPING + bool endmenu diff --git a/mm/Makefile b/mm/Makefile index 72227b24a616..bf71e295e9f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o @@ -109,6 +112,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o @@ -120,3 +124,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_IO_MAPPING) += io-mapping.o @@ -24,7 +24,6 @@ #include <linux/memblock.h> #include <linux/err.h> #include <linux/mm.h> -#include <linux/mutex.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/log2.h> @@ -80,16 +79,17 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned int count) + unsigned long count) { unsigned long bitmap_no, bitmap_count; + unsigned long flags; bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); - mutex_lock(&cma->lock); + spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); - mutex_unlock(&cma->lock); + spin_unlock_irqrestore(&cma->lock, flags); } static void __init cma_activate_area(struct cma *cma) @@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma) pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); - mutex_init(&cma->lock); + spin_lock_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); @@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma) unsigned long nr_part, nr_total = 0; unsigned long nbits = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); pr_info("number of available pages: "); for (;;) { next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); @@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma) start = next_zero_bit + nr_zero; } pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); } #else static inline void cma_debug_show_areas(struct cma *cma) { } @@ -423,25 +423,27 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - bool no_warn) +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - size_t i; + unsigned long i; struct page *page = NULL; int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) - return NULL; + goto out; - pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, count, align); if (!count) - return NULL; + goto out; + + trace_cma_alloc_start(cma->name, count, align); mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -449,15 +451,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - return NULL; + goto out; for (;;) { - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); break; } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); @@ -466,7 +468,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * our exclusive use. If the migration fails we will take the * lock again and unmark it. */ - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, @@ -483,11 +485,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } - trace_cma_alloc(pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align); /* * CMA can allocate multiple page blocks, which results in different @@ -500,12 +505,22 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, cma->name, count, ret); + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } pr_debug("%s(): returned %p\n", __func__, page); +out: + if (page) { + count_vm_event(CMA_ALLOC_SUCCESS); + cma_sysfs_account_success_pages(cma, count); + } else { + count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } + return page; } @@ -519,14 +534,15 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; - pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); @@ -537,7 +553,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); - trace_cma_release(pfn, pages, count); + trace_cma_release(cma->name, pfn, pages, count); return true; } @@ -3,19 +3,33 @@ #define __MM_CMA_H__ #include <linux/debugfs.h> +#include <linux/kobject.h> + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; struct cma { unsigned long base_pfn; unsigned long count; unsigned long *bitmap; unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; + spinlock_t lock; #ifdef CONFIG_CMA_DEBUGFS struct hlist_head mem_head; spinlock_t mem_head_lock; struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; @@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) return cma->count >> cma->order_per_bit; } +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; +#endif #endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c index d5bf8aa34fdc..2e7704955f4f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val) struct cma *cma = data; unsigned long used; - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); /* pages counter is smaller than sizeof(int) */ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)used << cma->order_per_bit; return 0; @@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val) unsigned long start, end = 0; unsigned long bitmap_maxno = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= bitmap_maxno) @@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val) end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; return 0; diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 000000000000..eb2f39caff59 --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim <minchan@kernel.org> + */ + +#include <linux/cma.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); diff --git a/mm/compaction.c b/mm/compaction.c index e04f4476e68e..3a6c6b821f80 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat) * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. - * Returns zero if there is a fatal signal pending, otherwise PFN of the - * first page that was not scanned (which may be both less, equal to or more - * than end_pfn). + * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, + * -ENOMEM in case we could not allocate a page, or 0. + * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), - * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field - * is neither read nor updated. + * and cc->nr_migratepages is updated accordingly. */ -static unsigned long +static int isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { @@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool skip_on_failure = false; unsigned long next_skip_pfn = 0; bool skip_updated = false; + int ret = 0; + + cc->migrate_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, while (unlikely(too_many_isolated(pgdat))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) - return 0; + return -EAGAIN; /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) - return 0; + return -EAGAIN; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return 0; + return -EINTR; } cond_resched(); @@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (fatal_signal_pending(current)) { cc->contended = true; + ret = -EINTR; - low_pfn = 0; goto fatal_pending; } @@ -904,6 +906,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } + if (PageHuge(page) && cc->alloc_contig) { + ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + + /* + * Fail isolation in case isolate_or_dissolve_huge_page() + * reports an error. In case of -ENOMEM, abort right away. + */ + if (ret < 0) { + /* Do not report -EBUSY down the chain */ + if (ret == -EBUSY) + ret = 0; + low_pfn += (1UL << compound_order(page)) - 1; + goto isolate_fail; + } + + if (PageHuge(page)) { + /* + * Hugepage was successfully isolated and placed + * on the cc->migratepages list. + */ + low_pfn += compound_nr(page) - 1; + goto isolate_success_no_list; + } + + /* + * Ok, the hugepage was dissolved. Now these pages are + * Buddy and cannot be re-allocated because they are + * isolated. Fall-through as the check below handles + * Buddy pages. + */ + } + /* * Skip if free. We read page order here without zone lock * which is generally unsafe, but the race window is small and @@ -1037,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); +isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); @@ -1063,7 +1098,7 @@ isolate_fail_put: put_page(page); isolate_fail: - if (!skip_on_failure) + if (!skip_on_failure && ret != -ENOMEM) continue; /* @@ -1089,6 +1124,9 @@ isolate_fail: */ next_skip_pfn += 1UL << cc->order; } + + if (ret == -ENOMEM) + break; } /* @@ -1130,7 +1168,9 @@ fatal_pending: if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); - return low_pfn; + cc->migrate_pfn = low_pfn; + + return ret; } /** @@ -1139,15 +1179,15 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns zero if isolation fails fatally due to e.g. pending signal. - * Otherwise, function returns one-past-the-last PFN of isolated page - * (which may be greater than end_pfn if end fell in a middle of a THP page). + * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM + * in case we could not allocate a page, or 0. */ -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, block_start_pfn, block_end_pfn; + int ret = 0; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; @@ -1166,17 +1206,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, block_end_pfn, cc->zone)) continue; - pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, - ISOLATE_UNEVICTABLE); + ret = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); - if (!pfn) + if (ret) break; if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } - return pfn; + return ret; } #endif /* CONFIG_COMPACTION || CONFIG_CMA */ @@ -1847,7 +1887,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) */ for (; block_end_pfn <= cc->free_pfn; fast_find_block = false, - low_pfn = block_end_pfn, + cc->migrate_pfn = low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1889,10 +1929,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* Perform the isolation */ - low_pfn = isolate_migratepages_block(cc, low_pfn, - block_end_pfn, isolate_mode); - - if (!low_pfn) + if (isolate_migratepages_block(cc, low_pfn, block_end_pfn, + isolate_mode)) return ISOLATE_ABORT; /* @@ -1903,9 +1941,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) break; } - /* Record where migration scanner will be restarted. */ - cc->migrate_pfn = low_pfn; - return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } @@ -2319,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); - migrate_prep_local(); + /* lru_add_drain_all could be expensive with involving other CPUs */ + lru_add_drain(); while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; @@ -2494,6 +2530,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, */ WRITE_ONCE(current->capture_control, NULL); *capture = READ_ONCE(capc.page); + /* + * Technically, it is also possible that compaction is skipped but + * the page is still captured out of luck(IRQ came and freed the page). + * Returning COMPACT_SUCCESS in such cases helps in properly accounting + * the COMPACT[STALL|FAIL] when compaction is skipped. + */ + if (*capture) + ret = COMPACT_SUCCESS; return ret; } @@ -2657,9 +2701,6 @@ static void compact_nodes(void) compact_node(nid); } -/* The written value is actually unused, all memory is compacted */ -int sysctl_compact_memory; - /* * Tunable for proactive compaction. It determines how * aggressively the kernel should compact memory in the @@ -2844,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) */ static int kcompactd(void *p) { - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; unsigned int proactive_defer = 0; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index a9bd6ce1ba02..05efe98a9ac2 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { pmd_t pmd; - if (!arch_ioremap_pmd_supported()) + if (!arch_vmap_pmd_supported(prot)) return; pr_debug("Validating PMD huge\n"); @@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { pud_t pud; - if (!arch_ioremap_pud_supported()) + if (!arch_vmap_pud_supported(prot)) return; pr_debug("Validating PUD huge\n"); diff --git a/mm/dmapool.c b/mm/dmapool.c index f3791532fef2..16483f86360e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -157,7 +157,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, if (!retval) return retval; - strlcpy(retval->name, name, sizeof(retval->name)); + strscpy(retval->name, name, sizeof(retval->name)); retval->dev = dev; diff --git a/mm/filemap.c b/mm/filemap.c index 43700480d897..7fadf211643c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping, page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } mapping->nrpages -= nr; } @@ -629,13 +618,53 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - if (dax_mapping(mapping)) - return mapping->nrexceptional; - return mapping->nrpages; } /** + * filemap_range_needs_writeback - check if range potentially needs writeback + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. Used by O_DIRECT + * read/write with IOCB_NOWAIT, to see if the caller needs to do + * filemap_write_and_wait_range() before proceeding. + * + * Return: %true if the caller should do filemap_write_and_wait_range() before + * doing O_DIRECT to a page in this range, %false otherwise. + */ +bool filemap_range_needs_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (!mapping_needs_writeback(mapping)) + return false; + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return false; + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; +} +EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); + +/** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts @@ -882,8 +911,6 @@ noinline int __add_to_page_cache_locked(struct page *page, if (xas_error(&xas)) goto unlock; - if (old) - mapping->nrexceptional--; mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ @@ -1433,6 +1460,67 @@ void unlock_page(struct page *page) EXPORT_SYMBOL(unlock_page); /** + * end_page_private_2 - Clear PG_private_2 and release any waiters + * @page: The page + * + * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for + * this. The page ref held for PG_private_2 being set is released. + * + * This is, for example, used when a netfs page is being written to a local + * disk cache, thereby allowing writes to the cache for the same page to be + * serialised. + */ +void end_page_private_2(struct page *page) +{ + page = compound_head(page); + VM_BUG_ON_PAGE(!PagePrivate2(page), page); + clear_bit_unlock(PG_private_2, &page->flags); + wake_up_page_bit(page, PG_private_2); + put_page(page); +} +EXPORT_SYMBOL(end_page_private_2); + +/** + * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page. + */ +void wait_on_page_private_2(struct page *page) +{ + page = compound_head(page); + while (PagePrivate2(page)) + wait_on_page_bit(page, PG_private_2); +} +EXPORT_SYMBOL(wait_on_page_private_2); + +/** + * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page + * @page: The page to wait on + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a + * fatal signal is received by the calling task. + * + * Return: + * - 0 if successful. + * - -EINTR if a fatal signal was encountered. + */ +int wait_on_page_private_2_killable(struct page *page) +{ + int ret = 0; + + page = compound_head(page); + while (PagePrivate2(page)) { + ret = wait_on_page_bit_killable(page, PG_private_2); + if (ret < 0) + break; + } + + return ret; +} +EXPORT_SYMBOL(wait_on_page_private_2_killable); + +/** * end_page_writeback - end writeback against a page * @page: the page */ @@ -1663,7 +1751,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * @mapping: the address_space to search * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a + * Looks up the page cache slot at @mapping & @index. If there is a * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a @@ -1969,8 +2057,14 @@ unlock: put: put_page(page); next: - if (!xa_is_value(page) && PageTransHuge(page)) - xas_set(&xas, page->index + thp_nr_pages(page)); + if (!xa_is_value(page) && PageTransHuge(page)) { + unsigned int nr_pages = thp_nr_pages(page); + + /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ + xas_set(&xas, page->index + nr_pages); + if (xas.xa_index < nr_pages) + break; + } } rcu_read_unlock(); @@ -2238,8 +2332,6 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, return error; if (PageUptodate(page)) return 0; - if (!page->mapping) /* page truncated */ - return AOP_TRUNCATED_PAGE; shrink_readahead_size_eio(&file->f_ra); return -EIO; } @@ -2571,8 +2663,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) + if (filemap_range_needs_writeback(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) return -EAGAIN; } else { retval = filemap_write_and_wait_range(mapping, @@ -2672,7 +2764,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); - pgoff_t max = (end - 1) / PAGE_SIZE; + pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct page *page; @@ -2681,7 +2773,8 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, rcu_read_lock(); while ((page = find_get_entry(&xas, max, XA_PRESENT))) { - loff_t pos = xas.xa_index * PAGE_SIZE; + loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; + unsigned int seek_size; if (start < pos) { if (!seek_data) @@ -2689,25 +2782,25 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - pos += seek_page_size(&xas, page); + seek_size = seek_page_size(&xas, page); + pos = round_up(pos + 1, seek_size); start = page_seek_hole_data(&xas, mapping, page, start, pos, seek_data); if (start < pos) goto unlock; + if (start >= end) + break; + if (seek_size > PAGE_SIZE) + xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(page)) put_page(page); } - rcu_read_unlock(); - if (seek_data) - return -ENXIO; - goto out; - + start = -ENXIO; unlock: rcu_read_unlock(); - if (!xa_is_value(page)) + if (page && !xa_is_value(page)) put_page(page); -out: if (start > end) return end; return start; @@ -2771,7 +2864,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; - DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff); + DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; @@ -2783,7 +2876,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (vmf->vma->vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra, ra->ra_pages); + page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } @@ -2869,7 +2962,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; pgoff_t max_off; @@ -2956,14 +3048,8 @@ page_not_uptodate: * because there really aren't any performance issues here * and we need to check for errors. */ - ClearPageError(page); fpin = maybe_unlock_mmap_for_io(vmf, fpin); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (!PageUptodate(page)) - error = -EIO; - } + error = filemap_read_page(file, mapping, page); if (fpin) goto out_retry; put_page(page); @@ -2971,7 +3057,6 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; out_retry: @@ -3182,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = { /* This is used for a general mmap of a disk file */ -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; @@ -3207,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } -int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } @@ -3639,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write); ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written = 0; ssize_t err; diff --git a/mm/frontswap.c b/mm/frontswap.c index 2183a56c7874..130e301c5ac0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -60,16 +60,20 @@ static u64 frontswap_succ_stores; static u64 frontswap_failed_stores; static u64 frontswap_invalidates; -static inline void inc_frontswap_loads(void) { +static inline void inc_frontswap_loads(void) +{ data_race(frontswap_loads++); } -static inline void inc_frontswap_succ_stores(void) { +static inline void inc_frontswap_succ_stores(void) +{ data_race(frontswap_succ_stores++); } -static inline void inc_frontswap_failed_stores(void) { +static inline void inc_frontswap_failed_stores(void) +{ data_race(frontswap_failed_stores++); } -static inline void inc_frontswap_invalidates(void) { +static inline void inc_frontswap_invalidates(void) +{ data_race(frontswap_invalidates++); } #else @@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, int orig_refs = refs; /* - * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast - * path, so fail and let the caller fall back to the slow path. + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. */ - if (unlikely(flags & FOLL_LONGTERM) && - is_migrate_cma_page(page)) + if (unlikely((flags & FOLL_LONGTERM) && + !is_pinnable_page(page))) return NULL; /* @@ -213,6 +214,58 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +static inline void compound_range_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *next, *page; + unsigned int nr = 1; + + if (i >= npages) + return; + + next = *list + i; + page = compound_head(next); + if (PageCompound(page) && compound_order(page) >= 1) + nr = min_t(unsigned int, + page + compound_nr(page) - next, npages - i); + + *head = page; + *ntails = nr; +} + +#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) + +static inline void compound_next(unsigned long i, unsigned long npages, + struct page **list, struct page **head, + unsigned int *ntails) +{ + struct page *page; + unsigned int nr; + + if (i >= npages) + return; + + page = compound_head(list[i]); + for (nr = i + 1; nr < npages; nr++) { + if (compound_head(list[nr]) != page) + break; + } + + *head = page; + *ntails = nr - i; +} + +#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ + for (__i = 0, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ + __i < __npages; __i += __ntails, \ + compound_next(__i, __npages, __list, &(__head), &(__ntails))) + /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. @@ -239,20 +292,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long index; - - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. - */ + struct page *head; + unsigned int ntails; if (!make_dirty) { unpin_user_pages(pages, npages); return; } - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); + for_each_compound_head(index, pages, npages, head, ntails) { /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key @@ -273,14 +321,50 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ - if (!PageDirty(page)) - set_page_dirty_lock(page); - unpin_user_page(page); + if (!PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** + * unpin_user_page_range_dirty_lock() - release and optionally dirty + * gup-pinned page range + * + * @page: the starting page of a range maybe marked dirty, and definitely released. + * @npages: number of consecutive pages to release. + * @make_dirty: whether to mark the pages dirty + * + * "gup-pinned page range" refers to a range of pages that has had one of the + * pin_user_pages() variants called on that page. + * + * For the page ranges defined by [page .. page+npages], make that range (or + * its head pages, if a compound page) dirty, if @make_dirty is true, and if the + * page range was previously listed as clean. + * + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), unpin_user_page(). + * + */ +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty) +{ + unsigned long index; + struct page *head; + unsigned int ntails; + + for_each_compound_range(index, &page, npages, head, ntails) { + if (make_dirty && !PageDirty(head)) + set_page_dirty_lock(head); + put_compound_head(head, ntails, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); + +/** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. @@ -292,6 +376,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; + struct page *head; + unsigned int ntails; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by @@ -300,13 +386,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages) */ if (WARN_ON(IS_ERR_VALUE(npages))) return; - /* - * TODO: this can be optimized for huge pages: if a series of pages is - * physically contiguous and part of the same compound page, then a - * single operation to the head page should suffice. - */ - for (index = 0; index < npages; index++) - unpin_user_page(pages[index]); + + for_each_compound_head(index, pages, npages, head, ntails) + put_compound_head(head, ntails, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_pages); @@ -435,18 +517,6 @@ retry: } } - if (flags & FOLL_SPLIT && PageTransCompound(page)) { - get_page(page); - pte_unmap_unlock(ptep, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (ret) - return ERR_PTR(ret); - goto retry; - } - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ if (unlikely(!try_grab_page(page, flags))) { page = ERR_PTR(-ENOMEM); @@ -591,7 +661,7 @@ retry_locked: spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { + if (flags & FOLL_SPLIT_PMD) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -600,19 +670,7 @@ retry_locked: split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else if (flags & FOLL_SPLIT) { - if (unlikely(!try_get_page(page))) { - spin_unlock(ptl); - return ERR_PTR(-ENOMEM); - } - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - if (pmd_none(*pmd)) - return no_page_table(vma, flags); - } else { /* flags & FOLL_SPLIT_PMD */ + } else { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; @@ -1470,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; unsigned long vm_flags; - int i; + long i; /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. @@ -1535,116 +1593,100 @@ struct page *get_dump_page(unsigned long addr) FOLL_FORCE | FOLL_DUMP | FOLL_GET); if (locked) mmap_read_unlock(mm); + + if (ret == 1 && is_page_poisoned(page)) + return NULL; + return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ -#ifdef CONFIG_CMA -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +#ifdef CONFIG_MIGRATION +/* + * Check whether all pages are pinnable, if so return number of pages. If some + * pages are not pinnable, migrate them, and unpin all pages. Return zero if + * pages were migrated, or if some pages were not successfully isolated. + * Return negative error if migration fails. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages, + unsigned int gup_flags) { unsigned long i; - unsigned long step; + unsigned long isolation_error_count = 0; bool drain_allow = true; - bool migrate_allow = true; - LIST_HEAD(cma_page_list); - long ret = nr_pages; + LIST_HEAD(movable_page_list); + long ret = 0; + struct page *prev_head = NULL; + struct page *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN, + .gfp_mask = GFP_USER | __GFP_NOWARN, }; -check_again: - for (i = 0; i < nr_pages;) { - - struct page *head = compound_head(pages[i]); - - /* - * gup may start from a tail page. Advance step by the left - * part. - */ - step = compound_nr(head) - (pages[i] - head); + for (i = 0; i < nr_pages; i++) { + head = compound_head(pages[i]); + if (head == prev_head) + continue; + prev_head = head; /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. + * If we get a movable page, since we are going to be pinning + * these entries, try to move them out if possible. */ - if (is_migrate_cma_page(head)) { - if (PageHuge(head)) - isolate_huge_page(head, &cma_page_list); - else { + if (!is_pinnable_page(head)) { + if (PageHuge(head)) { + if (!isolate_huge_page(head, &movable_page_list)) + isolation_error_count++; + } else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; } - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, &cma_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; } + list_add_tail(&head->lru, &movable_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_lru(head), + thp_nr_pages(head)); } } - - i += step; } - if (!list_empty(&cma_page_list)) { - /* - * drop the above get_user_pages reference. - */ - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, nr_pages); - else - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - - if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { - /* - * some of the pages failed migration. Do get_user_pages - * without migration. - */ - migrate_allow = false; + /* + * If list is empty, and no isolation errors, means that all pages are + * in the correct zone. + */ + if (list_empty(&movable_page_list) && !isolation_error_count) + return nr_pages; - if (!list_empty(&cma_page_list)) - putback_movable_pages(&cma_page_list); - } - /* - * We did migrate all the pages, Try to get the page references - * again migrating any new CMA pages which we failed to isolate - * earlier. - */ - ret = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, - gup_flags); - - if ((ret > 0) && migrate_allow) { - nr_pages = ret; - drain_allow = true; - goto check_again; - } + if (gup_flags & FOLL_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + } + if (!list_empty(&movable_page_list)) { + ret = migrate_pages(&movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN); + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); } - return ret; + return ret > 0 ? -ENOMEM : ret; } #else -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages, + unsigned int gup_flags) { return nr_pages; } -#endif /* CONFIG_CMA */ +#endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -1657,21 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long flags = 0; + unsigned int flags; long rc; - if (gup_flags & FOLL_LONGTERM) - flags = memalloc_nocma_save(); - - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, - gup_flags); + if (!(gup_flags & FOLL_LONGTERM)) + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + flags = memalloc_pin_save(); + do { + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + if (rc <= 0) + break; + rc = check_and_migrate_movable_pages(rc, pages, gup_flags); + } while (!rc); + memalloc_pin_restore(flags); - if (gup_flags & FOLL_LONGTERM) { - if (rc > 0) - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas, gup_flags); - memalloc_nocma_restore(flags); - } return rc; } diff --git a/mm/gup_test.c b/mm/gup_test.c index e3cf78e5873e..d974dec19e1c 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; } } break; @@ -94,7 +100,7 @@ static int __gup_test_ioctl(unsigned int cmd, { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; - int nr; + long nr; struct page **pages; int ret = 0; bool needs_mmap_lock = @@ -126,37 +132,34 @@ static int __gup_test_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - /* Filter out most gup flags: only allow a tiny subset here: */ - gup->flags &= FOLL_WRITE; - switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags, + nr = get_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->flags, pages + i, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_FAST_BENCHMARK: - nr = pin_user_pages_fast(addr, nr, gup->flags, + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->flags, pages + i, + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, + gup->gup_flags | FOLL_LONGTERM, pages + i, NULL); break; case DUMP_USER_PAGES_TEST: - if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) - nr = pin_user_pages(addr, nr, gup->flags, + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); else - nr = get_user_pages(addr, nr, gup->flags, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; default: @@ -187,7 +190,7 @@ static int __gup_test_ioctl(unsigned int cmd, start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages, gup->flags); + put_back_pages(cmd, pages, nr_pages, gup->test_flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); diff --git a/mm/gup_test.h b/mm/gup_test.h index 90a6713d50eb..887ac1d5f5bc 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -21,7 +21,8 @@ struct gup_test { __u64 addr; __u64 size; __u32 nr_pages_per_call; - __u32 flags; + __u32 gup_flags; + __u32 test_flags; /* * Each non-zero entry is the number of the page (1-based: first page is * page 1, so that zero entries mean "do nothing") from the .addr base. diff --git a/mm/highmem.c b/mm/highmem.c index 86f2b9495f9c..e389337e00b4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int __nr_free_highpages (void) +unsigned int __nr_free_highpages(void) { struct zone *zone; unsigned int pages = 0; @@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void) static int pkmap_count[LAST_PKMAP]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); -pte_t * pkmap_page_table; +pte_t *pkmap_page_table; /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr) if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); } @@ -278,9 +279,8 @@ void *kmap_high(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); unlock_kmap(); - return (void*) vaddr; + return (void *) vaddr; } - EXPORT_SYMBOL(kmap_high); #ifdef ARCH_NEEDS_KMAP_HIGH_GET @@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; } unlock_kmap_any(flags); - return (void*) vaddr; + return (void *) vaddr; } #endif @@ -618,7 +618,7 @@ void __kmap_local_sched_out(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } @@ -654,7 +654,7 @@ void __kmap_local_sched_in(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } @@ -737,7 +737,6 @@ done: spin_unlock_irqrestore(&pas->lock, flags); return ret; } - EXPORT_SYMBOL(page_address); /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae907a9c2050..98456017744d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/highmem.h> @@ -77,18 +78,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static struct page *get_huge_zero_page(void) +static bool get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return READ_ONCE(huge_zero_page); + return true; zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return NULL; + return false; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); @@ -101,7 +102,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return READ_ONCE(huge_zero_page); + return true; } static void put_huge_zero_page(void) @@ -624,14 +625,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - vm_fault_t ret2; - spin_unlock(vmf->ptl); put_page(page); pte_free(vma->vm_mm, pgtable); - ret2 = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); - return ret2; + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -1293,7 +1292,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) } page = pmd_page(orig_pmd); - VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page), page); /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { @@ -1464,12 +1463,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == NUMA_NO_NODE) { - /* If the page was locked, there are no parallel migrations */ - if (page_locked) - goto clear_pmdnuma; - } - /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { page_nid = NUMA_NO_NODE; @@ -1478,6 +1471,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) spin_unlock(vmf->ptl); put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; + } else if (target_nid == NUMA_NO_NODE) { + /* There are no parallel migrations and page is in the right + * node. Clear the numa hinting info in this pmd. + */ + goto clear_pmdnuma; } /* @@ -1696,7 +1694,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2104,7 +2102,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); @@ -2303,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, __split_huge_pmd(vma, pmd, address, freeze, page); } +static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) +{ + /* + * If the new address isn't hpage aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && + range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), + ALIGN(address, HPAGE_PMD_SIZE))) + split_huge_pmd_address(vma, address, false, NULL); +} + void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { - /* - * If the new start address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (start & ~HPAGE_PMD_MASK && - (start & HPAGE_PMD_MASK) >= vma->vm_start && - (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start, false, NULL); + /* Check if we need to split start first. */ + split_huge_pmd_if_needed(vma, start); - /* - * If the new end address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (end & ~HPAGE_PMD_MASK && - (end & HPAGE_PMD_MASK) >= vma->vm_start && - (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end, false, NULL); + /* Check if we need to split end next. */ + split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't hpage aligned and it could previously - * contain an hugepage: check if we need to split an huge pmd. + * If we're also updating the vma->vm_next->vm_start, + * check if we need to split it. */ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; nstart += adjust_next; - if (nstart & ~HPAGE_PMD_MASK && - (nstart & HPAGE_PMD_MASK) >= next->vm_start && - (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart, false, NULL); + split_huge_pmd_if_needed(next, nstart); } } @@ -2838,8 +2830,8 @@ void deferred_split_huge_page(struct page *page) ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - memcg_set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -2924,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = { }; #ifdef CONFIG_DEBUG_FS -static int split_huge_pages_set(void *data, u64 val) +static void split_huge_pages_all(void) { struct zone *zone; struct page *page; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; - if (val != 1) - return -EINVAL; - + pr_debug("Split all THPs\n"); for_each_populated_zone(zone) { max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { @@ -2957,15 +2947,243 @@ static int split_huge_pages_set(void *data, u64 val) unlock_page(page); next: put_page(page); + cond_resched(); } } - pr_info("%lu of %lu THP split\n", split, total); + pr_debug("%lu of %lu THP split\n", split, total); +} - return 0; +static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); +} + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. + */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = find_vma(mm, addr); + unsigned int follflags; + struct page *page; + + if (!vma || addr < vma->vm_start) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); + + if (IS_ERR(page)) + continue; + if (!page) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_huge_page(compound_head(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; +} + +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; } -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, - "%llu\n"); + +#define MAX_INPUT_BUF_SZ 255 + +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + if (tok) { + strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = input_len; + + goto out; + } + + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; static int __init split_huge_pages_debugfs(void) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5b1ab1f427c5..629aa4c2259c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> -#include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include "internal.h" @@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -280,6 +283,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; + /* + * The caller will hold exactly one h_cg->css reference for the + * whole contiguous reservation region. But this area might be + * scattered when there are already some file_regions reside in + * it. As a result, many file_regions may share only one css + * reference. In order to ensure that one file_region must hold + * exactly one h_cg->css reference, we should do css_get for + * each file_region and leave the reference held by caller + * untouched. + */ + css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in @@ -293,6 +307,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, #endif } +static void put_uncharge_info(struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (rg->css) + css_put(rg->css); +#endif +} + static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { @@ -316,6 +338,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) prg->to = rg->to; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); rg = prg; @@ -327,6 +350,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) nrg->from = rg->from; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); } } @@ -532,7 +556,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } @@ -662,7 +685,7 @@ retry: del += t - f; hugetlb_cgroup_uncharge_file_region( - resv, rg, t - f); + resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; @@ -683,7 +706,7 @@ retry: if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - rg->from); + rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; @@ -691,13 +714,13 @@ retry: if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - t - rg->from); + t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - f); + rg->to - f, false); del += rg->to - f; rg->to = f; @@ -722,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* @@ -1038,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1047,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) @@ -1184,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1252,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -1306,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. + */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1314,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1323,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). - */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1352,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1362,6 +1429,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1390,7 +1458,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1400,82 +1468,46 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock(&hugetlb_lock); } /* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. + * Must be called with the hugetlb lock held */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) -{ - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) +static void __prep_account_new_huge_page(struct hstate *h, int nid) { - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. - */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; - } - - __free_huge_page(page); + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, nid); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1595,7 +1627,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + page = __alloc_pages(gfp_mask, order, nid, nmask); if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1672,17 +1704,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. - * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1690,23 +1725,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } - update_and_free_page(h, page); - ret = 1; + remove_hugetlb_page(h, page, acct_surplus); break; } } - return ret; + return page; } /* @@ -1728,7 +1754,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1737,7 +1763,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1746,7 +1771,7 @@ retry: * when it is dissolved. */ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1768,15 +1793,14 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, page, false); h->max_huge_pages--; + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1818,16 +1842,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page @@ -1837,7 +1861,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1846,7 +1870,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1896,17 +1920,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -1943,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -1954,7 +1979,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -1971,7 +1996,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2011,12 +2036,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2028,17 +2053,17 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + lockdep_assert_held(&hugetlb_lock); + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2055,24 +2080,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } @@ -2154,27 +2176,26 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. - * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, @@ -2245,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h, } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. + */ + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +{ + struct hstate *h; + struct page *head; + int ret = -EBUSY; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. + */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2295,7 +2444,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -2307,7 +2456,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates @@ -2315,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2323,7 +2472,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2336,7 +2485,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2526,24 +2675,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; + remove_hugetlb_page(h, page, false); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2562,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { @@ -2589,6 +2747,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2601,7 +2761,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; - spin_lock(&hugetlb_lock); + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2632,7 +2797,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2661,14 +2827,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2695,18 +2861,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2861,9 +3039,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3194,6 +3372,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -3246,10 +3425,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; @@ -3449,9 +3628,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } out: return ret; @@ -3547,7 +3726,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) if (!delta) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -3586,7 +3765,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) return_unused_surplus_pages(h, (unsigned long) -delta); out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } @@ -3774,7 +3953,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc(dst, vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -4289,6 +4468,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long reason) +{ + vm_fault_t ret; + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, reason); + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + return ret; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -4327,35 +4544,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, retry: page = find_lock_page(mapping, idx); if (!page) { - /* - * Check for page in userfault range - */ + /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .flags = flags, - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; - - /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. - */ - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MISSING); goto out; } @@ -4374,13 +4567,10 @@ retry: * sure there really is no pte entry. */ ptl = huge_pte_lock(h, mm, ptep); - if (!huge_pte_none(huge_ptep_get(ptep))) { - ret = 0; - spin_unlock(ptl); - goto out; - } + ret = 0; + if (huge_pte_none(huge_ptep_get(ptep))) + ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); - ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -4414,6 +4604,16 @@ retry: VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MINOR); + goto out; + } } /* @@ -4542,7 +4742,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -4654,6 +4854,7 @@ out_mutex: return ret; } +#ifdef CONFIG_USERFAULTFD /* * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with * modifications for huge pages. @@ -4663,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct address_space *mapping; pgoff_t idx; unsigned long size; @@ -4674,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spinlock_t *ptl; int ret; struct page *page; + int writable; - if (!*pagep) { + mapping = dst_vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + } else if (!*pagep) { ret = -ENOMEM; page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) @@ -4704,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, */ __SetPageUptodate(page); - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - - /* - * If shared, add to page cache - */ - if (vm_shared) { + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) @@ -4755,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); - if (dst_vma->vm_flags & VM_WRITE) + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ + if (is_continue && !vm_shared) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + if (writable) _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); @@ -4770,20 +4983,22 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - SetHPageMigratable(page); - if (vm_shared) + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); - if (vm_shared) + if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: put_page(page); goto out; } +#endif /* CONFIG_USERFAULTFD */ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, int refs, struct page **pages, @@ -4975,14 +5190,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -/* - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can - * implement this. - */ -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { @@ -5187,6 +5394,10 @@ bool hugetlb_reserve_pages(struct inode *inode, */ long rsv_adjust; + /* + * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the + * reference to h_cg->css. See comment below for detail. + */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); @@ -5194,6 +5405,14 @@ bool hugetlb_reserve_pages(struct inode *inode, rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); + } else if (h_cg) { + /* + * The file_regions will hold their own reference to + * h_cg->css. So we should release the reference held + * via hugetlb_cgroup_charge_cgroup_rsvd() when we are + * done. + */ + hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; @@ -5247,6 +5466,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); @@ -5293,6 +5515,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) return false; } +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + return vma_shareable(vma, addr); +} + /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -5337,9 +5568,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is * only required for subsequent processing. */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { - struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -5349,9 +5580,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) pte_t *pte; spinlock_t *ptl; - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) @@ -5415,9 +5643,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } -#define want_pmd_share() (1) + #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { return NULL; } @@ -5432,11 +5661,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } -#define want_pmd_share() (0) + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; @@ -5454,8 +5687,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); - if (want_pmd_share() && pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } @@ -5599,7 +5832,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { @@ -5609,16 +5842,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list) ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } void putback_active_hugepage(struct page *page) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); } @@ -5646,13 +5879,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) SetHPageTemporary(oldpage); ClearHPageTemporary(newpage); - spin_lock(&hugetlb_lock); + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. + */ + if (new_nid == old_nid) + return; + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + } +} + +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. + */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address, start, end; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + start = ALIGN(vma->vm_start, PUD_SIZE); + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return; + + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + unsigned long tmp = address; + + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + /* We don't want 'address' to be changed */ + huge_pmd_unshare(mm, vma, &tmp, ptep); + spin_unlock(ptl); } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/vm/mmu_notifier.rst. + */ + mmu_notifier_invalidate_range_end(&range); } #ifdef CONFIG_CMA diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f68b51fcda3d..5383023d0cca 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) do { idx = 0; for_each_hstate(h) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(idx, h_cg, page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); idx++; } cond_resched(); @@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, - unsigned long nr_pages) + unsigned long nr_pages, + bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; @@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); - css_put(rg->css); + /* + * Only do css_put(rg->css) when we delete the entire region + * because one file_region must hold exactly one css reference. + */ + if (region_del) + css_put(rg->css); } } @@ -778,8 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); @@ -789,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) set_hugetlb_cgroup(newhpage, h_cg); set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return; } diff --git a/mm/internal.h b/mm/internal.h index 1432feec62df..feeaaf06705d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb, void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); -void force_page_cache_ra(struct readahead_control *, struct file_ra_state *, - unsigned long nr); +void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { - DEFINE_READAHEAD(ractl, file, mapping, index); - force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); + force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, @@ -97,6 +96,26 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * When kernel touch the user page, the user page may be have been marked + * poison but still mapped in user space, if without this page, the kernel + * can guarantee the data integrity and operation success, the kernel is + * better to check the posion status and avoid touching it, be good not to + * panic, coredump for process fatal signal is a sample case matching this + * scenario. Or if kernel can't guarantee the data integrity, it's better + * not to call this function, let kernel touch the poison page and get to + * panic. + */ +static inline bool is_page_poisoned(struct page *page) +{ + if (PageHWPoison(page)) + return true; + else if (PageHuge(page) && PageHWPoison(compound_head(page))) + return true; + + return false; +} + extern unsigned long highest_memmap_pfn; /* @@ -126,10 +145,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. + * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed + * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ @@ -225,7 +244,13 @@ struct compact_control { unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ + /* + * Acts as an in/out parameter to page isolation for migration. + * isolate_migratepages uses it as a search base. + * isolate_migratepages_block will update the value to the next pfn + * after the last isolated one. + */ + unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; @@ -261,7 +286,7 @@ struct capture_control { unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int find_suitable_fallback(struct free_area *area, unsigned int order, @@ -427,7 +452,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } - +static inline void vunmap_range_noflush(unsigned long start, unsigned long end) +{ +} #endif /* !CONFIG_MMU */ /* @@ -618,4 +645,21 @@ struct migration_target_control { gfp_t gfp_mask; }; +/* + * mm/vmalloc.c + */ +#ifdef CONFIG_MMU +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift); +#else +static inline +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + return -EINVAL; +} +#endif + +void vunmap_range_noflush(unsigned long start, unsigned long end); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 11c75fb07584..32e390c42c53 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -22,7 +22,7 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, - vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, diff --git a/mm/io-mapping.c b/mm/io-mapping.c new file mode 100644 index 000000000000..01b362799930 --- /dev/null +++ b/mm/io-mapping.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/mm.h> +#include <linux/io-mapping.h> + +/** + * io_mapping_map_user - remap an I/O mapping to userspace + * @iomap: the source io_mapping + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * + * Note: this is only safe if the mm semaphore is held when called. + */ +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size) +{ + vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) + return -EINVAL; + + /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ + return remap_pfn_range_notrack(vma, addr, pfn, size, + __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); +} +EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/ioremap.c b/mm/ioremap.c index 5fa1ab41d152..d1dcc7e744ac 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -16,237 +16,22 @@ #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static int __read_mostly ioremap_p4d_capable; -static int __read_mostly ioremap_pud_capable; -static int __read_mostly ioremap_pmd_capable; -static int __read_mostly ioremap_huge_disabled; +static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT; static int __init set_nohugeiomap(char *str) { - ioremap_huge_disabled = 1; + iomap_max_page_shift = P4D_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); - -void __init ioremap_huge_init(void) -{ - if (!ioremap_huge_disabled) { - if (arch_ioremap_p4d_supported()) - ioremap_p4d_capable = 1; - if (arch_ioremap_pud_supported()) - ioremap_pud_capable = 1; - if (arch_ioremap_pmd_supported()) - ioremap_pmd_capable = 1; - } -} - -static inline int ioremap_p4d_enabled(void) -{ - return ioremap_p4d_capable; -} - -static inline int ioremap_pud_enabled(void) -{ - return ioremap_pud_capable; -} - -static inline int ioremap_pmd_enabled(void) -{ - return ioremap_pmd_capable; -} - -#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static inline int ioremap_p4d_enabled(void) { return 0; } -static inline int ioremap_pud_enabled(void) { return 0; } -static inline int ioremap_pmd_enabled(void) { return 0; } +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const bool iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pte_t *pte; - u64 pfn; - - pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel_track(pmd, addr, mask); - if (!pte) - return -ENOMEM; - do { - BUG_ON(!pte_none(*pte)); - set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - *mask |= PGTBL_PTE_MODIFIED; - return 0; -} - -static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_pmd_enabled()) - return 0; - - if ((end - addr) != PMD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PMD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PMD_SIZE)) - return 0; - - if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) - return 0; - - return pmd_set_huge(pmd, phys_addr, prot); -} - -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_alloc_track(&init_mm, pud, addr, mask); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { - *mask |= PGTBL_PMD_MODIFIED; - continue; - } - - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_pud_enabled()) - return 0; - - if ((end - addr) != PUD_SIZE) - return 0; - - if (!IS_ALIGNED(addr, PUD_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, PUD_SIZE)) - return 0; - - if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) - return 0; - - return pud_set_huge(pud, phys_addr, prot); -} - -static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - pud_t *pud; - unsigned long next; - - pud = pud_alloc_track(&init_mm, p4d, addr, mask); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { - *mask |= PGTBL_PUD_MODIFIED; - continue; - } - - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (pud++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - -static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, - pgprot_t prot) -{ - if (!ioremap_p4d_enabled()) - return 0; - - if ((end - addr) != P4D_SIZE) - return 0; - - if (!IS_ALIGNED(addr, P4D_SIZE)) - return 0; - - if (!IS_ALIGNED(phys_addr, P4D_SIZE)) - return 0; - - if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) - return 0; - - return p4d_set_huge(p4d, phys_addr, prot); -} - -static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot, - pgtbl_mod_mask *mask) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); - if (!p4d) - return -ENOMEM; - do { - next = p4d_addr_end(addr, end); - - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { - *mask |= PGTBL_P4D_MODIFIED; - continue; - } - - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) - return -ENOMEM; - } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); - return 0; -} - int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { - pgd_t *pgd; - unsigned long start; - unsigned long next; - int err; - pgtbl_mod_mask mask = 0; - - might_sleep(); - BUG_ON(addr >= end); - - start = addr; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, - &mask); - if (err) - break; - } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); - - flush_cache_vmap(start, end); - - if (mask & ARCH_PAGE_TABLE_SYNC_MASK) - arch_sync_kernel_mappings(start, end); - - return err; + return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift); } #ifdef CONFIG_GENERIC_IOREMAP diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b5e08d4cefec..6bb87f2acd4e 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -60,16 +60,16 @@ void kasan_disable_current(void) void __kasan_unpoison_range(const void *address, size_t size) { - kasan_unpoison(address, size); + kasan_unpoison(address, size, false); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK /* Unpoison the entire stack for a task. */ void kasan_unpoison_task_stack(struct task_struct *task) { void *base = task_stack_page(task); - kasan_unpoison(base, THREAD_SIZE); + kasan_unpoison(base, THREAD_SIZE, false); } /* Unpoison the stack for the current task beyond a watermark sp value. */ @@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison(base, watermark - base); + kasan_unpoison(base, watermark - base, false); } #endif /* CONFIG_KASAN_STACK */ @@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void) return 0; } -void __kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; @@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison(page_address(page), PAGE_SIZE << order); + kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } -void __kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, - KASAN_FREE_PAGE); + KASAN_FREE_PAGE, init); } /* @@ -251,18 +251,18 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison(object, cache->object_size); + kasan_unpoison(object, cache->object_size, false); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); } /* @@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool ____kasan_slab_free(struct kmem_cache *cache, - void *object, unsigned long ip, bool quarantine) +static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine, bool init) { u8 tag; void *tagged_object; @@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, } kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), - KASAN_KMALLOC_FREE); + KASAN_KMALLOC_FREE, init); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; @@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, return kasan_quarantine_put(cache, object); } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool init) { - return ____kasan_slab_free(cache, object, ip, true); + return ____kasan_slab_free(cache, object, ip, true, init); } static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) @@ -407,9 +408,9 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) if (unlikely(!PageSlab(page))) { if (____kasan_kfree_large(ptr, ip)) return; - kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); + kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false); } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false); + ____kasan_slab_free(page->slab_cache, ptr, ip, false, false); } } @@ -428,7 +429,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, } void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) + void *object, gfp_t flags, bool init) { u8 tag; void *tagged_object; @@ -453,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, * Unpoison the whole object. * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. */ - kasan_unpoison(tagged_object, cache->object_size); + kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled()) @@ -496,7 +497,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + KASAN_KMALLOC_REDZONE, false); /* * Save alloc info (if possible) for kmalloc() allocations. @@ -546,7 +547,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + KASAN_PAGE_REDZONE, false); return (void *)ptr; } @@ -563,7 +564,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag * Part of it might already have been unpoisoned, but it's unknown * how big that part is. */ - kasan_unpoison(object, size); + kasan_unpoison(object, size, false); page = virt_to_head_page(object); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 2e55e0f82f39..53cbf28859b5 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -208,11 +208,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - kasan_unpoison(global->beg, global->size); + kasan_unpoison(global->beg, global->size, false); kasan_poison(global->beg + aligned_size, global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + KASAN_GLOBAL_REDZONE, false); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -292,11 +292,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), - size - rounded_down_size); + size - rounded_down_size, false); kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); + KASAN_ALLOCA_LEFT, false); kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + KASAN_ALLOCA_RIGHT, false); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -306,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top); + kasan_unpoison(stack_top, stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 2aad21fda156..4004388b4e4b 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -25,6 +25,12 @@ enum kasan_arg { KASAN_ARG_ON, }; +enum kasan_arg_mode { + KASAN_ARG_MODE_DEFAULT, + KASAN_ARG_MODE_SYNC, + KASAN_ARG_MODE_ASYNC, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -38,6 +44,7 @@ enum kasan_arg_fault { }; static enum kasan_arg kasan_arg __ro_after_init; +static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; static enum kasan_arg_fault kasan_arg_fault __ro_after_init; @@ -45,6 +52,10 @@ static enum kasan_arg_fault kasan_arg_fault __ro_after_init; DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); +/* Whether the asynchronous mode is enabled. */ +bool kasan_flag_async __ro_after_init; +EXPORT_SYMBOL_GPL(kasan_flag_async); + /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); @@ -68,6 +79,23 @@ static int __init early_kasan_flag(char *arg) } early_param("kasan", early_kasan_flag); +/* kasan.mode=sync/async */ +static int __init early_kasan_mode(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "sync")) + kasan_arg_mode = KASAN_ARG_MODE_SYNC; + else if (!strcmp(arg, "async")) + kasan_arg_mode = KASAN_ARG_MODE_ASYNC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.mode", early_kasan_mode); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -115,7 +143,15 @@ void kasan_init_hw_tags_cpu(void) return; hw_init_tags(KASAN_TAG_MAX); - hw_enable_tagging(); + + /* + * Enable async mode only when explicitly requested through + * the command line. + */ + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else + hw_enable_tagging_sync(); } /* kasan_init_hw_tags() is called once on boot CPU. */ @@ -132,6 +168,22 @@ void __init kasan_init_hw_tags(void) /* Enable KASAN. */ static_branch_enable(&kasan_flag_enabled); + switch (kasan_arg_mode) { + case KASAN_ARG_MODE_DEFAULT: + /* + * Default to sync mode. + * Do nothing, kasan_flag_async keeps its default value. + */ + break; + case KASAN_ARG_MODE_SYNC: + /* Do nothing, kasan_flag_async keeps its default value. */ + break; + case KASAN_ARG_MODE_ASYNC: + /* Async mode enabled. */ + kasan_flag_async = true; + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: /* Default to enabling stack trace collection. */ @@ -194,10 +246,16 @@ void kasan_set_tagging_report_once(bool state) } EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); -void kasan_enable_tagging(void) +void kasan_enable_tagging_sync(void) +{ + hw_enable_tagging_sync(); +} +EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync); + +void kasan_force_async_fault(void) { - hw_enable_tagging(); + hw_force_async_tag_fault(); } -EXPORT_SYMBOL_GPL(kasan_enable_tagging); +EXPORT_SYMBOL_GPL(kasan_force_async_fault); #endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8c55634d6edd..3820ca54743b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,20 +7,37 @@ #include <linux/stackdepot.h> #ifdef CONFIG_KASAN_HW_TAGS + #include <linux/static_key.h> + DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +extern bool kasan_flag_async __ro_after_init; + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); } + +static inline bool kasan_async_mode_enabled(void) +{ + return kasan_flag_async; +} #else + static inline bool kasan_stack_collection_enabled(void) { return true; } + +static inline bool kasan_async_mode_enabled(void) +{ + return false; +} + #endif extern bool kasan_flag_panic __ro_after_init; +extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -146,7 +163,7 @@ struct kasan_alloc_meta { struct kasan_track alloc_track; #ifdef CONFIG_KASAN_GENERIC /* - * call_rcu() call stack is stored into struct kasan_alloc_meta. + * The auxiliary stack is stored into struct kasan_alloc_meta. * The free stack is stored into struct kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; @@ -231,7 +248,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size); const char *kasan_get_bug_type(struct kasan_access_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); -#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } @@ -275,8 +292,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifdef CONFIG_KASAN_HW_TAGS -#ifndef arch_enable_tagging -#define arch_enable_tagging() +#ifndef arch_enable_tagging_sync +#define arch_enable_tagging_sync() +#endif +#ifndef arch_enable_tagging_async +#define arch_enable_tagging_async() #endif #ifndef arch_init_tags #define arch_init_tags(max_tag) @@ -284,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_set_tagging_report_once #define arch_set_tagging_report_once(state) #endif +#ifndef arch_force_async_tag_fault +#define arch_force_async_tag_fault() +#endif #ifndef arch_get_random_tag #define arch_get_random_tag() (0xFF) #endif @@ -291,19 +314,23 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define arch_get_mem_tag(addr) (0xFF) #endif #ifndef arch_set_mem_tag_range -#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr)) +#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr)) #endif -#define hw_enable_tagging() arch_enable_tagging() +#define hw_enable_tagging_sync() arch_enable_tagging_sync() +#define hw_enable_tagging_async() arch_enable_tagging_async() #define hw_init_tags(max_tag) arch_init_tags(max_tag) #define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) +#define hw_force_async_tag_fault() arch_force_async_tag_fault() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) -#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#define hw_set_mem_tag_range(addr, size, tag, init) \ + arch_set_mem_tag_range((addr), (size), (tag), (init)) #else /* CONFIG_KASAN_HW_TAGS */ -#define hw_enable_tagging() +#define hw_enable_tagging_sync() +#define hw_enable_tagging_async() #define hw_set_tagging_report_once(state) #endif /* CONFIG_KASAN_HW_TAGS */ @@ -311,12 +338,14 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state); -void kasan_enable_tagging(void); +void kasan_enable_tagging_sync(void); +void kasan_force_async_fault(void); #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ static inline void kasan_set_tagging_report_once(bool state) { } -static inline void kasan_enable_tagging(void) { } +static inline void kasan_enable_tagging_sync(void) { } +static inline void kasan_force_async_fault(void) { } #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ @@ -330,7 +359,7 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *addr, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init) { addr = kasan_reset_tag(addr); @@ -343,10 +372,10 @@ static inline void kasan_poison(const void *addr, size_t size, u8 value) if (WARN_ON(size & KASAN_GRANULE_MASK)) return; - hw_set_mem_tag_range((void *)addr, size, value); + hw_set_mem_tag_range((void *)addr, size, value, init); } -static inline void kasan_unpoison(const void *addr, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -360,7 +389,7 @@ static inline void kasan_unpoison(const void *addr, size_t size) return; size = round_up(size, KASAN_GRANULE_SIZE); - hw_set_mem_tag_range((void *)addr, size, tag); + hw_set_mem_tag_range((void *)addr, size, tag, init); } static inline bool kasan_byte_accessible(const void *addr) @@ -368,8 +397,7 @@ static inline bool kasan_byte_accessible(const void *addr) u8 ptr_tag = get_tag(addr); u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag != KASAN_TAG_INVALID) && - (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); + return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag; } #else /* CONFIG_KASAN_HW_TAGS */ @@ -379,22 +407,24 @@ static inline bool kasan_byte_accessible(const void *addr) * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range + * @init - whether to initialize the memory range (only for hardware tag-based) * * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. */ -void kasan_poison(const void *addr, size_t size, u8 value); +void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, can be unaligned + * @init - whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. * For the generic mode, the last granule of the memory range gets partially * unpoisoned based on the @size. */ -void kasan_unpoison(const void *addr, size_t size); +void kasan_unpoison(const void *addr, size_t size, bool init); bool kasan_byte_accessible(const void *addr); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 87b271206163..14bd51ea2348 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -87,7 +87,8 @@ static void start_report(unsigned long *flags) static void end_report(unsigned long *flags, unsigned long addr) { - trace_error_report_end(ERROR_DETECTOR_KASAN, addr); + if (!kasan_async_mode_enabled()) + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -360,6 +361,25 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags, (unsigned long)object); } +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) +{ + unsigned long flags; + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + + start_report(&flags); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous mode enabled: no access details available\n"); + pr_err("\n"); + dump_stack(); + end_report(&flags, 0); +} +#endif /* CONFIG_KASAN_HW_TAGS */ + static void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 41f374585144..139615ef326b 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -128,7 +128,7 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, unsigned long *value) @@ -148,7 +148,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, } /* Copy token (+ 1 byte for '\0'). */ - strlcpy(token, *frame_descr, tok_len + 1); + strscpy(token, *frame_descr, tok_len + 1); } /* Advance frame_descr past separator. */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 63f43443f5d7..727ad4629173 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -69,7 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -void kasan_poison(const void *addr, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; @@ -106,7 +106,7 @@ void kasan_poison_last_granule(const void *addr, size_t size) } #endif -void kasan_unpoison(const void *addr, size_t size) +void kasan_unpoison(const void *addr, size_t size, bool init) { u8 tag = get_tag(addr); @@ -129,7 +129,7 @@ void kasan_unpoison(const void *addr, size_t size) return; /* Unpoison all granules that cover the object. */ - kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false); /* Partially poison the last granule for the generic mode. */ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) @@ -344,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -352,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison(start, size); + kasan_unpoison(start, size, false); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 94c2d33be333..9df8e7f69e87 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -121,10 +121,14 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); - u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); + void *untagged_addr = kasan_reset_tag(addr); + u8 shadow_byte; - return (shadow_byte != KASAN_TAG_INVALID) && - (tag == KASAN_TAG_KERNEL || tag == shadow_byte); + if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + return false; + + shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); + return tag == KASAN_TAG_KERNEL || tag == shadow_byte; } #define DEFINE_HWASAN_LOAD_STORE(size) \ @@ -159,7 +163,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison((void *)addr, size, tag); + kasan_poison((void *)addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 3b8ec938470a..e18fbbd5d9b4 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,14 +10,17 @@ #include <linux/atomic.h> #include <linux/bug.h> #include <linux/debugfs.h> +#include <linux/irq_work.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> +#include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/memblock.h> #include <linux/moduleparam.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/sched/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -371,6 +374,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { + memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } @@ -480,6 +484,14 @@ static bool __init kfence_init_pool(void) addr += 2 * PAGE_SIZE; } + /* + * The pool is live and will never be deallocated from this point on. + * Remove the pool object from the kmemleak object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. + */ + kmemleak_free(__kfence_pool); + return true; err: @@ -577,6 +589,17 @@ late_initcall(kfence_debugfs_init); /* === Allocation Gate Timer ================================================ */ +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + +static void wake_up_kfence_timer(struct irq_work *work) +{ + wake_up(&allocation_wait); +} +static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); +#endif + /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a @@ -594,29 +617,27 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - /* Enable static key, and await allocation to happen. */ atomic_set(&kfence_allocation_gate, 0); #ifdef CONFIG_KFENCE_STATIC_KEYS + /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - /* - * Await an allocation. Timeout after 1 second, in case the kernel stops - * doing allocations, to avoid stalling this worker task for too long. - */ - { - unsigned long end_wait = jiffies + HZ; - - do { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&kfence_allocation_gate) != 0) - break; - schedule_timeout(1); - } while (time_before(jiffies, end_wait)); - __set_current_state(TASK_RUNNING); + + if (sysctl_hung_task_timeout_secs) { + /* + * During low activity with no allocations we might wait a + * while; let's avoid the hung task warning. + */ + wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); + } else { + wait_event(allocation_wait, atomic_read(&kfence_allocation_gate)); } + /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif - schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, + msecs_to_jiffies(kfence_sample_interval)); } static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); @@ -645,7 +666,7 @@ void __init kfence_init(void) } WRITE_ONCE(kfence_enabled, true); - schedule_delayed_work(&kfence_timer, 0); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); @@ -719,6 +740,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) */ if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* + * waitqueue_active() is fully ordered after the update of + * kfence_allocation_gate per atomic_inc_return(). + */ + if (waitqueue_active(&allocation_wait)) { + /* + * Calling wake_up() here may deadlock when allocations happen + * from within timer code. Use an irq_work to defer it. + */ + irq_work_queue(&wake_up_kfence_timer_work); + } +#endif if (!READ_ONCE(kfence_enabled)) return NULL; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7d6cb912b05..ea74da3232ab 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; @@ -716,17 +716,17 @@ next: if (pte_write(pteval)) writable = true; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { + + if (unlikely(!writable)) { result = SCAN_PAGE_RO; + } else if (unlikely(!referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; } - out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, @@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid) * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!node_reclaim_mode) + if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ @@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm, mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) - goto out; + goto out_up_write; /* check if the pmd is still valid */ if (mm_find_pmd(mm, address) != pmd) - goto out; + goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); result = SCAN_FAIL; - goto out; + goto out_up_write; } /* @@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm, __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); + /* + * spin_lock() below is not the equivalent of smp_wmb(), but + * the smp_wmb() inside __SetPageUptodate() can be reused to + * avoid the copy_huge_page writes to become visible after + * the set_pmd_at() write. + */ __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. - */ - smp_wmb(); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); @@ -1216,8 +1215,6 @@ out_nolock: mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; -out: - goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1274,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small @@ -1447,7 +1440,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) int i; if (!vma || !vma->vm_file || - vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; /* @@ -1533,16 +1526,16 @@ abort: goto drop_hpage; } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return 0; + return; if (!mmap_write_trylock(mm)) - return -EBUSY; + return; if (unlikely(khugepaged_test_exit(mm))) goto out; @@ -1553,7 +1546,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); - return 0; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) @@ -2057,9 +2049,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, BUILD_BUG(); } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { - return 0; } #endif @@ -2205,11 +2196,9 @@ static void khugepaged_do_scan(void) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - lru_add_drain_all(); while (progress < pages) { diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c0014d3b91c1..92a2d4885808 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -97,6 +97,7 @@ #include <linux/atomic.h> #include <linux/kasan.h> +#include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -589,7 +590,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, atomic_set(&object->use_count, 1); object->flags = OBJECT_ALLOCATED; object->pointer = ptr; - object->size = size; + object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ @@ -1202,7 +1203,7 @@ static void update_refs(struct kmemleak_object *object) } /* - * Memory scanning is a long process and it needs to be interruptable. This + * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) @@ -215,8 +215,6 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ -#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) - /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -778,12 +776,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); + page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); if (!page) goto out; hlist_del(&rmap_item->hlist); - unlock_page(page); put_page(page); if (!hlist_empty(&stable_node->hlist)) @@ -794,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -817,8 +815,7 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct rmap_item **rmap_list) { while (*rmap_list) { struct rmap_item *rmap_item = *rmap_list; @@ -989,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); @@ -1771,7 +1768,6 @@ chain_append: * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { - VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, @@ -1785,7 +1781,6 @@ chain_append: * of the current nid for this page * content. */ - VM_BUG_ON(!is_stable_node_chain(stable_node)); VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); @@ -2337,7 +2332,7 @@ next_mm: * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_list); + remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, @@ -2634,7 +2629,7 @@ again: vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & ~KSM_FLAG_MASK; + addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; diff --git a/mm/list_lru.c b/mm/list_lru.c index 6f067b6b935f..cd58790d0fb3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - memcg_set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, if (src->nr_items) { dst->nr_items += src->nr_items; - memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index b59054ef2e10..b890854ec761 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -165,10 +165,12 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, return 0; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD /* Huge pud */ walk->action = ACTION_CONTINUE; if (pud_trans_huge(pudval) || pud_devmap(pudval)) WARN_ON(pud_write(pudval) || pud_dirty(pudval)); +#endif return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e064ac0d850a..3004afb6d090 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,10 +255,8 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages); -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages); +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -295,7 +293,7 @@ static void obj_cgroup_release(struct percpu_ref *ref) spin_lock_irqsave(&css_set_lock, flags); memcg = obj_cgroup_memcg(objcg); if (nr_pages) - __memcg_kmem_uncharge(memcg, nr_pages); + obj_cgroup_uncharge_pages(objcg, nr_pages); list_del(&objcg->list); mem_cgroup_put(memcg); spin_unlock_irqrestore(&css_set_lock, flags); @@ -402,129 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif -static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); - -static void memcg_free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) -{ - struct memcg_shrinker_map *new, *old; - int nid; - - lockdep_assert_held(&memcg_shrinker_map_mutex); - - for_each_node(nid) { - old = rcu_dereference_protected( - mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); - /* Not yet online memcg */ - if (!old) - return 0; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); - - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); - call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); - } - - return 0; -} - -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; - int nid; - - if (mem_cgroup_is_root(memcg)) - return; - - for_each_node(nid) { - pn = mem_cgroup_nodeinfo(memcg, nid); - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); - } -} - -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - struct memcg_shrinker_map *map; - int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - size = memcg_shrinker_map_size; - for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - memcg_free_shrinker_maps(memcg); - ret = -ENOMEM; - break; - } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); - } - mutex_unlock(&memcg_shrinker_map_mutex); - - return ret; -} - -int memcg_expand_shrinker_maps(int new_id) -{ - int size, old_size, ret = 0; - struct mem_cgroup *memcg; - - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; - if (size <= old_size) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - if (!root_mem_cgroup) - goto unlock; - - for_each_mem_cgroup(memcg) { - if (mem_cgroup_is_root(memcg)) - continue; - ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto unlock; - } - } -unlock: - if (!ret) - memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); - return ret; -} - -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; - - rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); - rcu_read_unlock(); - } -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -713,7 +588,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); + mz = memcg->nodeinfo[nid]; mctz = soft_limit_tree_node(nid); if (mctz) mem_cgroup_remove_exceeded(mz, mctz); @@ -764,28 +639,37 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) */ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - long x, threshold = MEMCG_CHARGE_BATCH; - if (mem_cgroup_disabled()) return; - if (memcg_stat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); +} - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); - if (unlikely(abs(x) > threshold)) { - struct mem_cgroup *mi; +/* idx can be of type enum memcg_stat_item or node_stat_item. */ +static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->stat[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmstats[idx]); +/* idx can be of type enum memcg_stat_item or node_stat_item. */ +static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) +{ + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); +#ifdef CONFIG_SMP + if (x < 0) x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); +#endif + return x; } static struct mem_cgroup_per_node * @@ -796,7 +680,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) parent = parent_mem_cgroup(pn->memcg); if (!parent) return NULL; - return mem_cgroup_nodeinfo(parent, nid); + return parent->nodeinfo[nid]; } void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -855,18 +739,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ - struct mem_cgroup *memcg = page_memcg(head); + struct mem_cgroup *memcg; pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; + rcu_read_lock(); + memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { + rcu_read_unlock(); __mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); + rcu_read_unlock(); } EXPORT_SYMBOL(__mod_lruvec_page_state); @@ -903,30 +791,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - unsigned long x; - if (mem_cgroup_disabled()) return; - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); - if (unlikely(x > MEMCG_CHARGE_BATCH)) { - struct mem_cgroup *mi; - - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(memcg->vmstats_local->events[idx], x); - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &mi->vmevents[idx]); - x = 0; - } - __this_cpu_write(memcg->vmstats_percpu->events[idx], x); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return atomic_long_read(&memcg->vmevents[event]); + return READ_ONCE(memcg->vmstats.events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -935,7 +809,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) int cpu; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_local->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[event], cpu); return x; } @@ -1055,20 +929,6 @@ static __always_inline struct mem_cgroup *active_memcg(void) return current->active_memcg; } -static __always_inline struct mem_cgroup *get_active_memcg(void) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = active_memcg(); - /* remote memcg must hold a ref. */ - if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - - return memcg; -} - static __always_inline bool memcg_kmem_bypass(void) { /* Allow remote memcg charging from any context. */ @@ -1083,20 +943,6 @@ static __always_inline bool memcg_kmem_bypass(void) } /** - * If active memcg is set, do not fallback to current->mm->memcg. - */ -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) -{ - if (memcg_kmem_bypass()) - return NULL; - - if (unlikely(active_memcg())) - return get_active_memcg(); - - return get_mem_cgroup_from_mm(current->mm); -} - -/** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation @@ -1136,7 +982,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup_per_node *mz; - mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); + mz = root->nodeinfo[reclaim->pgdat->node_id]; iter = &mz->iter; if (prev && reclaim->generation != iter->generation) @@ -1238,7 +1084,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from, int nid; for_each_node(nid) { - mz = mem_cgroup_nodeinfo(from, nid); + mz = from->nodeinfo[nid]; iter = &mz->iter; cmpxchg(&iter->position, dead_memcg, NULL); } @@ -1571,6 +1417,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ + cgroup_rstat_flush(memcg->css.cgroup); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -2118,11 +1965,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) * This function protects unlocked LRU pages from being moved to * another cgroup. * - * It ensures lifetime of the returned memcg. Caller is responsible - * for the lifetime of the page; __unlock_page_memcg() is available - * when @page might get freed inside the locked section. + * It ensures lifetime of the locked memcg. Caller is responsible + * for the lifetime of the page. */ -struct mem_cgroup *lock_page_memcg(struct page *page) +void lock_page_memcg(struct page *page) { struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; @@ -2132,21 +1978,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page itself from being freed. E.g. writeback - * doesn't hold a page reference and relies on PG_writeback to - * keep off truncation, migration and so forth. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page_memcg(head); if (unlikely(!memcg)) - return NULL; + return; #ifdef CONFIG_PROVE_LOCKING local_irq_save(flags); @@ -2155,7 +1995,7 @@ again: #endif if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page_memcg(head)) { @@ -2164,24 +2004,17 @@ again: } /* - * When charge migration first begins, we can have locked and - * unlocked page stat updates happening concurrently. Track - * the task who has the lock for unlock_page_memcg(). + * When charge migration first begins, we can have multiple + * critical sections holding the fast-path RCU lock and one + * holding the slowpath move_lock. Track the task who has the + * move_lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - - return memcg; } EXPORT_SYMBOL(lock_page_memcg); -/** - * __unlock_page_memcg - unlock and unpin a memcg - * @memcg: the memcg - * - * Unlock and unpin a memcg returned by lock_page_memcg(). - */ -void __unlock_page_memcg(struct mem_cgroup *memcg) +static void __unlock_page_memcg(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -2381,50 +2214,39 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static int memcg_hotplug_cpu_dead(unsigned int cpu) +static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) { - struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg, *mi; - - stock = &per_cpu(memcg_stock, cpu); - drain_stock(stock); + int nid; - for_each_mem_cgroup(memcg) { + for_each_node(nid) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + unsigned long stat[NR_VM_NODE_STAT_ITEMS]; + struct batched_lruvec_stat *lstatc; int i; - for (i = 0; i < MEMCG_NR_STAT; i++) { - int nid; - long x; - - x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &memcg->vmstats[i]); - - if (i >= NR_VM_NODE_STAT_ITEMS) - continue; + lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + stat[i] = lstatc->count[i]; + lstatc->count[i] = 0; + } - for_each_node(nid) { - struct mem_cgroup_per_node *pn; + do { + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + atomic_long_add(stat[i], &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } +} - pn = mem_cgroup_nodeinfo(memcg, nid); - x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) - do { - atomic_long_add(x, &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } - } +static int memcg_hotplug_cpu_dead(unsigned int cpu) +{ + struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - long x; + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); - x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); - if (x) - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - atomic_long_add(x, &memcg->vmevents[i]); - } - } + for_each_mem_cgroup(memcg) + memcg_flush_lruvec_page_state(memcg, cpu); return 0; } @@ -2793,9 +2615,6 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (gfp_mask & __GFP_NOFAIL) - goto force; - if (fatal_signal_pending(current)) goto force; @@ -2905,6 +2724,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) page->memcg_data = (unsigned long)memcg; } +static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); +retry: + memcg = obj_cgroup_memcg(objcg); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; + rcu_read_unlock(); + + return memcg; +} + #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) @@ -3056,23 +2889,45 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } -/** - * __memcg_kmem_charge: charge a number of kernel pages to a memcg - * @memcg: memory cgroup to charge +/* + * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg + * @objcg: object cgroup to uncharge + * @nr_pages: number of pages to uncharge + */ +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages) +{ + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + refill_stock(memcg, nr_pages); + + css_put(&memcg->css); +} + +/* + * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg + * @objcg: object cgroup to charge * @gfp: reclaim mode * @nr_pages: number of pages to charge * * Returns 0 on success, an error code on failure. */ -static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; + struct mem_cgroup *memcg; int ret; + memcg = get_mem_cgroup_from_objcg(objcg); + ret = try_charge(memcg, gfp, nr_pages); if (ret) - return ret; + goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { @@ -3084,25 +2939,15 @@ static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, */ if (gfp & __GFP_NOFAIL) { page_counter_charge(&memcg->kmem, nr_pages); - return 0; + goto out; } cancel_charge(memcg, nr_pages); - return -ENOMEM; + ret = -ENOMEM; } - return 0; -} - -/** - * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg - * @memcg: memcg to uncharge - * @nr_pages: number of pages to uncharge - */ -static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); +out: + css_put(&memcg->css); - refill_stock(memcg, nr_pages); + return ret; } /** @@ -3115,18 +2960,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page */ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - struct mem_cgroup *memcg; + struct obj_cgroup *objcg; int ret = 0; - memcg = get_mem_cgroup_from_current(); - if (memcg && !mem_cgroup_is_root(memcg)) { - ret = __memcg_kmem_charge(memcg, gfp, 1 << order); + objcg = get_obj_cgroup_from_current(); + if (objcg) { + ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { - page->memcg_data = (unsigned long)memcg | + page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - css_put(&memcg->css); + obj_cgroup_put(objcg); } return ret; } @@ -3138,16 +2983,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page_memcg(page); + struct obj_cgroup *objcg; unsigned int nr_pages = 1 << order; - if (!memcg) + if (!PageMemcgKmem(page)) return; - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - __memcg_kmem_uncharge(memcg, nr_pages); + objcg = __page_objcg(page); + obj_cgroup_uncharge_pages(objcg, nr_pages); page->memcg_data = 0; - css_put(&memcg->css); + obj_cgroup_put(objcg); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3180,11 +3025,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); - if (nr_pages) { - rcu_read_lock(); - __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages); - rcu_read_unlock(); - } + if (nr_pages) + obj_cgroup_uncharge_pages(old, nr_pages); /* * The leftover is flushed to the centralized per-memcg value. @@ -3242,7 +3084,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) { - struct mem_cgroup *memcg; unsigned int nr_pages, nr_bytes; int ret; @@ -3259,24 +3100,16 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) * refill_obj_stock(), called from this function or * independently later. */ - rcu_read_lock(); -retry: - memcg = obj_cgroup_memcg(objcg); - if (unlikely(!css_tryget(&memcg->css))) - goto retry; - rcu_read_unlock(); - nr_pages = size >> PAGE_SHIFT; nr_bytes = size & (PAGE_SIZE - 1); if (nr_bytes) nr_pages += 1; - ret = __memcg_kmem_charge(memcg, gfp, nr_pages); + ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); if (!ret && nr_bytes) refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); - css_put(&memcg->css); return ret; } @@ -3300,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int nr) for (i = 1; i < nr; i++) head[i].memcg_data = head->memcg_data; - css_get_many(&memcg->css, nr - 1); + + if (PageMemcgKmem(head)) + obj_cgroup_get_many(__page_objcg(head), nr - 1); + else + css_get_many(&memcg->css, nr - 1); } #ifdef CONFIG_MEMCG_SWAP @@ -3549,6 +3386,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { + cgroup_rstat_flush(memcg->css.cgroup); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3613,57 +3451,6 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) -{ - unsigned long stat[MEMCG_NR_STAT] = {0}; - struct mem_cgroup *mi; - int node, cpu, i; - - for_each_online_cpu(cpu) - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < MEMCG_NR_STAT; i++) - atomic_long_add(stat[i], &mi->vmstats[i]); - - for_each_node(node) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - struct mem_cgroup_per_node *pi; - - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] = 0; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - stat[i] += per_cpu( - pn->lruvec_stat_cpu->count[i], cpu); - - for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pi->lruvec_stat[i]); - } -} - -static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) -{ - unsigned long events[NR_VM_EVENT_ITEMS]; - struct mem_cgroup *mi; - int cpu, i; - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] = 0; - - for_each_online_cpu(cpu) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += per_cpu(memcg->vmstats_percpu->events[i], - cpu); - - for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - atomic_long_add(events[i], &mi->vmevents[i]); -} - #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3980,6 +3767,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, @@ -4050,6 +3839,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4108,7 +3899,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { - mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + mz = memcg->nodeinfo[pgdat->node_id]; anon_cost += mz->lruvec.anon_cost; file_cost += mz->lruvec.file_cost; @@ -4137,7 +3928,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > 100) return -EINVAL; - if (css->parent) + if (!mem_cgroup_is_root(memcg)) memcg->swappiness = val; else vm_swappiness = val; @@ -4487,7 +4278,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!css->parent || !((val == 0) || (val == 1))) + if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL; memcg->oom_kill_disable = val; @@ -4526,22 +4317,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } -/* - * idx can be of type enum memcg_stat_item or node_stat_item. - * Keep in sync with memcg_exact_page(). - */ -static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = atomic_long_read(&memcg->vmstats[idx]); - int cpu; - - for_each_online_cpu(cpu) - x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; - if (x < 0) - x = 0; - return x; -} - /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -4567,13 +4342,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); + cgroup_rstat_flush_irqsafe(memcg->css.cgroup); - *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + - memcg_exact_page_state(memcg, NR_ACTIVE_FILE); - *pheadroom = PAGE_COUNTER_MAX; + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + + memcg_page_state(memcg, NR_ACTIVE_FILE); + *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), READ_ONCE(memcg->memory.high)); @@ -5205,19 +4981,20 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); - free_percpu(memcg->vmstats_local); kfree(memcg); } static void mem_cgroup_free(struct mem_cgroup *memcg) { + int cpu; + memcg_wb_domain_exit(memcg); /* - * Flush percpu vmstats and vmevents to guarantee the value correctness - * on parent's and all ancestor levels. + * Flush percpu lruvec stats to guarantee the value + * correctness on parent's and all ancestor levels. */ - memcg_flush_percpu_vmstats(memcg); - memcg_flush_percpu_vmevents(memcg); + for_each_online_cpu(cpu) + memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5244,11 +5021,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } - memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, - GFP_KERNEL_ACCOUNT); - if (!memcg->vmstats_local) - goto fail; - memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5346,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for memcg_expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (memcg_alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_info(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5382,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); drain_all_stock(memcg); @@ -5414,7 +5187,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - memcg_free_shrinker_maps(memcg); + free_shrinker_info(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } @@ -5448,6 +5221,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct memcg_vmstats_percpu *statc; + long delta, v; + int i; + + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + + for (i = 0; i < MEMCG_NR_STAT; i++) { + /* + * Collect the aggregated propagation counts of groups + * below us. We're in a per-cpu loop here and this is + * a global counter, so the first cycle will get them. + */ + delta = memcg->vmstats.state_pending[i]; + if (delta) + memcg->vmstats.state_pending[i] = 0; + + /* Add CPU changes on this level since the last flush */ + v = READ_ONCE(statc->state[i]); + if (v != statc->state_prev[i]) { + delta += v - statc->state_prev[i]; + statc->state_prev[i] = v; + } + + if (!delta) + continue; + + /* Aggregate counts on this level and propagate upwards */ + memcg->vmstats.state[i] += delta; + if (parent) + parent->vmstats.state_pending[i] += delta; + } + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + delta = memcg->vmstats.events_pending[i]; + if (delta) + memcg->vmstats.events_pending[i] = 0; + + v = READ_ONCE(statc->events[i]); + if (v != statc->events_prev[i]) { + delta += v - statc->events_prev[i]; + statc->events_prev[i] = v; + } + + if (!delta) + continue; + + memcg->vmstats.events[i] += delta; + if (parent) + parent->vmstats.events_pending[i] += delta; + } +} + #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ static int mem_cgroup_do_precharge(unsigned long count) @@ -6501,6 +6330,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, @@ -6683,6 +6513,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } +static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, + gfp_t gfp) +{ + unsigned int nr_pages = thp_nr_pages(page); + int ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) + goto out; + + css_get(&memcg->css); + commit_charge(page, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); + local_irq_enable(); +out: + return ret; +} + /** * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge @@ -6692,55 +6543,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * + * Do not use this for pages allocated for swapin. + * * Returns 0 on success. Otherwise, an error code is returned. */ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - unsigned int nr_pages = thp_nr_pages(page); - struct mem_cgroup *memcg = NULL; - int ret = 0; + struct mem_cgroup *memcg; + int ret; if (mem_cgroup_disabled()) - goto out; + return 0; - if (PageSwapCache(page)) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id; + memcg = get_mem_cgroup_from_mm(mm); + ret = __mem_cgroup_charge(page, memcg, gfp_mask); + css_put(&memcg->css); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page and memcg binding is - * protected by the page lock, which serializes swap cache - * removal, which in turn serializes uncharging. - */ - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page_memcg(compound_head(page))) - goto out; + return ret; +} - id = lookup_swap_cgroup_id(ent); - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } +/** + * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin + * @page: page to charge + * @mm: mm context of the victim + * @gfp: reclaim mode + * @entry: swap entry for which the page is allocated + * + * This function charges a page allocated for swapin. Please call this before + * adding the page to the swapcache. + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + int ret; - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); + if (mem_cgroup_disabled()) + return 0; - ret = try_charge(memcg, gfp_mask, nr_pages); - if (ret) - goto out_put; + id = lookup_swap_cgroup_id(entry); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = get_mem_cgroup_from_mm(mm); + rcu_read_unlock(); - css_get(&memcg->css); - commit_charge(page, memcg); + ret = __mem_cgroup_charge(page, memcg, gfp); - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); - local_irq_enable(); + css_put(&memcg->css); + return ret; +} +/* + * mem_cgroup_swapin_uncharge_swap - uncharge swap slot + * @entry: swap entry for which the page is charged + * + * Call this function after successfully adding the charged page to swapcache. + * + * Note: This function assumes the page for which swap slot is being uncharged + * is order 0 page. + */ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ /* * Cgroup1's unified memory+swap counter has been charged with the * new swapcache page, finish the transfer by uncharging the swap @@ -6753,25 +6620,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (do_memsw_account() && PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; + if (!mem_cgroup_disabled() && do_memsw_account()) { /* * The swap entry might not get freed for a long time, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(entry, 1); } - -out_put: - css_put(&memcg->css); -out: - return ret; } struct uncharge_gather { struct mem_cgroup *memcg; - unsigned long nr_pages; + unsigned long nr_memory; unsigned long pgpgout; unsigned long nr_kmem; struct page *dummy_page; @@ -6786,10 +6647,10 @@ static void uncharge_batch(const struct uncharge_gather *ug) { unsigned long flags; - if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); + if (ug->nr_memory) { + page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); @@ -6797,7 +6658,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) local_irq_save(flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -6808,40 +6669,60 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page_memcg(page)) - return; - /* * Nobody should be changing or seriously looking at - * page_memcg(page) at this point, we have fully + * page memcg or objcg at this point, we have fully * exclusive access to the page. */ + if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. + */ + memcg = get_mem_cgroup_from_objcg(objcg); + } else { + memcg = __page_memcg(page); + } - if (ug->memcg != page_memcg(page)) { + if (!memcg) + return; + + if (ug->memcg != memcg) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page_memcg(page); + ug->memcg = memcg; + ug->dummy_page = page; /* pairs with css_put in uncharge_batch */ - css_get(&ug->memcg->css); + css_get(&memcg->css); } nr_pages = compound_nr(page); - ug->nr_pages += nr_pages; - if (PageMemcgKmem(page)) + if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; - else + + page->memcg_data = 0; + obj_cgroup_put(objcg); + } else { + /* LRU pages aren't accounted at the root level */ + if (!mem_cgroup_is_root(memcg)) + ug->nr_memory += nr_pages; ug->pgpgout++; - ug->dummy_page = page; - page->memcg_data = 0; - css_put(&ug->memcg->css); + page->memcg_data = 0; + } + + css_put(&memcg->css); } /** diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 24210c9bd843..bd3945446d47 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1368,7 +1368,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * communicated in siginfo, see kill_proc() */ start = (page->index << PAGE_SHIFT) & ~(size - 1); - unmap_mapping_range(page->mapping, start, start + size, 0); + unmap_mapping_range(page->mapping, start, size, 0); } kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); rc = 0; diff --git a/mm/memory.c b/mm/memory.c index 5efa07fb6cdc..cbdc2cd9cedb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -166,7 +166,7 @@ static int __init init_zero_pfn(void) zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } -core_initcall(init_zero_pfn); +early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { @@ -2260,26 +2260,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; - unsigned long remap_pfn = pfn; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) @@ -2309,10 +2300,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); - if (err) - return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); @@ -2324,12 +2311,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); if (err) - untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); + return -EINVAL; + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -2446,13 +2457,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, } do { next = pmd_addr_end(addr, end); - if (create || !pmd_none_or_clear_bad(pmd)) { - err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create, mask); - if (err) - break; + if (pmd_none(*pmd) && !create) + continue; + if (WARN_ON_ONCE(pmd_leaf(*pmd))) + return -EINVAL; + if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { + if (!create) + continue; + pmd_clear_bad(pmd); } + err = apply_to_pte_range(mm, pmd, addr, next, + fn, data, create, mask); + if (err) + break; } while (pmd++, addr = next, addr != end); + return err; } @@ -2474,13 +2493,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, } do { next = pud_addr_end(addr, end); - if (create || !pud_none_or_clear_bad(pud)) { - err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create, mask); - if (err) - break; + if (pud_none(*pud) && !create) + continue; + if (WARN_ON_ONCE(pud_leaf(*pud))) + return -EINVAL; + if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { + if (!create) + continue; + pud_clear_bad(pud); } + err = apply_to_pmd_range(mm, pud, addr, next, + fn, data, create, mask); + if (err) + break; } while (pud++, addr = next, addr != end); + return err; } @@ -2502,13 +2529,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, } do { next = p4d_addr_end(addr, end); - if (create || !p4d_none_or_clear_bad(p4d)) { - err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create, mask); - if (err) - break; + if (p4d_none(*p4d) && !create) + continue; + if (WARN_ON_ONCE(p4d_leaf(*p4d))) + return -EINVAL; + if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { + if (!create) + continue; + p4d_clear_bad(p4d); } + err = apply_to_pud_range(mm, p4d, addr, next, + fn, data, create, mask); + if (err) + break; } while (p4d++, addr = next, addr != end); + return err; } @@ -2528,9 +2563,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - if (!create && pgd_none_or_clear_bad(pgd)) + if (pgd_none(*pgd) && !create) continue; - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } + err = apply_to_p4d_range(mm, pgd, addr, next, + fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); @@ -3309,28 +3352,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { - int err; - __SetPageLocked(page); __SetPageSwapBacked(page); - set_page_private(page, entry.val); - /* Tell memcg to use swap ownership records */ - SetPageSwapCache(page); - err = mem_cgroup_charge(page, vma->vm_mm, - GFP_KERNEL); - ClearPageSwapCache(page); - if (err) { + if (mem_cgroup_swapin_charge_page(page, + vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; goto out_page; } + mem_cgroup_swapin_uncharge_swap(entry); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(page, shadow); lru_cache_add(page); + + /* To provide entry to swap_readpage() */ + set_page_private(page, entry.val); swap_readpage(page, true); + set_page_private(page, 0); } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, @@ -4100,7 +4141,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; - bool migrated = false; pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -4117,29 +4157,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out; } - /* - * Make it present again, Depending on how arch implementes non - * accessible ptes, some can allow access by kernel mode. - */ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + /* Get the normal PTE */ + old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); - pte = pte_mkyoung(pte); - if (was_writable) - pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); - if (!page) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (!page) + goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + if (PageCompound(page)) + goto out_map; /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as @@ -4149,7 +4177,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!pte_write(pte)) + if (!was_writable) flags |= TNF_NO_GROUP; /* @@ -4163,24 +4191,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == NUMA_NO_NODE) { put_page(page); - goto out; + goto out_map; } + pte_unmap_unlock(vmf->pte, vmf->ptl); /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) { + if (migrate_misplaced_page(page, vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; - } else + } else { flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; + } + goto out_map; + } out: if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; +out_map: + /* + * Make it present again, depending on how arch implements + * non-accessible ptes, some can allow access by kernel mode. + */ + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto out; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0cdbbfbc5757..70620d0dd923 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,16 @@ #include "internal.h" #include "shuffle.h" + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -648,9 +658,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. */ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -817,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, +struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) @@ -829,24 +846,86 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. + */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -877,11 +956,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1064,6 +1139,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. + * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). @@ -1073,6 +1187,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1099,13 +1214,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1573,9 +1701,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1611,6 +1746,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * in a way that pages from isolated pageblock are left on pcplists. */ zone_pcp_disable(zone); + lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, @@ -1642,7 +1778,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } cond_resched(); - lru_add_drain_all(); ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { @@ -1687,15 +1822,12 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + lru_cache_enable(); zone_pcp_enable(zone); /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); @@ -1750,6 +1882,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1824,6 +1964,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1836,6 +1979,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1847,7 +2015,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ab51132547b8..3ebe2cfc64af 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } @@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err = 0; nodemask_t tmp; - migrate_prep(); + lru_cache_disable(); mmap_read_lock(mm); @@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { - int s,d; + int s, d; int source = NUMA_NO_NODE; int dest = 0; @@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, break; } mmap_read_unlock(mm); + + lru_cache_enable(); if (err < 0) return err; return busy; @@ -1323,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - migrate_prep(); + lru_cache_disable(); } { NODEMASK_SCRATCH(scratch); @@ -1371,6 +1373,8 @@ up_out: mmap_write_unlock(mm); mpol_out: mpol_put(new); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + lru_cache_enable(); return err; } @@ -2140,7 +2144,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, { struct page *page; - page = __alloc_pages(gfp, order, nid); + page = __alloc_pages(gfp, order, nid, NULL); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; @@ -2153,30 +2157,22 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_pages_vma - Allocate a page for a VMA. - * - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. + * alloc_pages_vma - Allocate a page for a VMA. + * @gfp: GFP flags. + * @order: Order of the GFP allocation. + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: For hugepages try only the preferred node if possible. * - * @order:Order of the GFP allocation. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. - * @node: Which node to prefer for allocation (modulo policy). - * @hugepage: for hugepages try only the preferred node if possible + * Allocate a page for a specific address in @vma, using the appropriate + * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock + * of the mm_struct of the VMA to prevent it from going away. Should be + * used for all allocations for pages that will be mapped into user space. * - * This function allocates a page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must read-lock the mmap_lock of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into user space. Returns - * NULL when no page can be allocated. + * Return: The page on success or NULL if allocation fails. */ -struct page * -alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, +struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; @@ -2237,7 +2233,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); + page = __alloc_pages(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return page; @@ -2245,21 +2241,20 @@ out: EXPORT_SYMBOL(alloc_pages_vma); /** - * alloc_pages_current - Allocate pages. + * alloc_pages - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. * - * @gfp: - * %GFP_USER user allocation, - * %GFP_KERNEL kernel allocation, - * %GFP_HIGHMEM highmem allocation, - * %GFP_FS don't call back into a file system. - * %GFP_ATOMIC don't sleep. - * @order: Power of two of allocation size in pages. 0 is a single page. + * Allocate 1 << @order contiguous pages. The physical address of the + * first page is naturally aligned (eg an order-3 allocation will be aligned + * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current + * process is honoured when in process context. * - * Allocate a page from the kernel page pool. When not in - * interrupt context and apply the current process NUMA policy. - * Returns NULL when no page can be allocated. + * Context: Can be called from any context, providing the appropriate GFP + * flags are used. + * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_current(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; @@ -2274,13 +2269,13 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else - page = __alloc_pages_nodemask(gfp, order, + page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); return page; } -EXPORT_SYMBOL(alloc_pages_current); +EXPORT_SYMBOL(alloc_pages); int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { @@ -2457,14 +2452,11 @@ static void sp_free(struct sp_node *n) * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's - * node id. - * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). + * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. + * + * Return: -1 if the page is in a node that is valid for this policy, or a + * suitable node ID to allocate a replacement page from. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/mempool.c b/mm/mempool.c index 79959fac27d7..a258cf4de575 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) - kasan_free_pages(element, (unsigned long)pool->pool_data); + kasan_free_pages(element, (unsigned long)pool->pool_data, false); } static void kasan_unpoison_element(mempool_t *pool, void *element) @@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) - kasan_alloc_pages(element, (unsigned long)pool->pool_data); + kasan_alloc_pages(element, (unsigned long)pool->pool_data, false); } static __always_inline void add_element(mempool_t *pool, void *element) @@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); diff --git a/mm/memremap.c b/mm/memremap.c index 7aa7d6e80ee5..15a074ffb8d7 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include <linux/device.h> #include <linux/io.h> diff --git a/mm/migrate.c b/mm/migrate.c index 62b81d5257aa..6b37d00890ca 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -57,28 +57,6 @@ #include "internal.h" -/* - * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is - * undesirable, use migrate_prep_local() - */ -void migrate_prep(void) -{ - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); -} - -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ -void migrate_prep_local(void) -{ - lru_add_drain(); -} - int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -140,15 +118,10 @@ out: return -EBUSY; } -/* It should be called on page which is PG_movable */ -void putback_movable_page(struct page *page) +static void putback_movable_page(struct page *page) { struct address_space *mapping; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - mapping = page_mapping(page); mapping->a_ops->putback_page(page); __ClearPageIsolated(page); @@ -1375,7 +1348,7 @@ out_unlock: out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); - else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + else if (rc != -EAGAIN) list_move_tail(&hpage->lru, ret); /* @@ -1445,6 +1418,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int rc, nr_subpages; LIST_HEAD(ret_pages); + trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -1617,7 +1592,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); + new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask); if (new_page && PageTransHuge(new_page)) prep_transhuge_page(new_page); @@ -1769,7 +1744,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, int start, i; int err = 0, err1; - migrate_prep(); + lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; @@ -1838,6 +1813,7 @@ out_flush: if (err >= 0) err = err1; out: + lru_cache_enable(); return err; } @@ -2110,17 +2086,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -static inline bool is_shared_exec_page(struct vm_area_struct *vma, - struct page *page) -{ - if (page_mapcount(page) != 1 && - (page_is_file_lru(page) || vma_is_shmem(vma)) && - (vma->vm_flags & VM_EXEC)) - return true; - - return false; -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -2138,7 +2103,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (is_shared_exec_page(vma, page)) + if (page_mapcount(page) != 1 && page_is_file_lru(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -2193,9 +2159,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; - if (is_shared_exec_page(vma, page)) - goto out; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2307,7 +2270,6 @@ out_fail: out_unlock: unlock_page(page); -out: put_page(page); return 0; } @@ -2316,44 +2278,38 @@ out: #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_hole(unsigned long start, +static int migrate_vma_collect_skip(unsigned long start, unsigned long end, - __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; - /* Only allow populating anonymous memory. */ - if (!vma_is_anonymous(walk->vma)) { - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = 0; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - } - return 0; - } - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; + migrate->src[migrate->npages++] = 0; } return 0; } -static int migrate_vma_collect_skip(unsigned long start, +static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; unsigned long addr; + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; - migrate->src[migrate->npages++] = 0; + migrate->npages++; + migrate->cpages++; } return 0; @@ -2973,6 +2929,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } } else { entry = mk_pte(page, vma->vm_page_prot); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..df590fda5688 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; - struct vm_area_struct * vma, * prev; + struct vm_area_struct *vma, *prev; int error; VM_BUG_ON(offset_in_page(start)); @@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - struct vm_area_struct * vma, * prev = NULL; + struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/mm_init.c b/mm/mm_init.c index 8e02e865cc65..9ddaf0e1b0ab 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -19,10 +19,6 @@ #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; -#ifndef SECTIONS_SHIFT -#define SECTIONS_SHIFT 0 -#endif - /* The zonelists are simply reported, validation is manual. */ void __init mminit_verify_zonelist(void) { diff --git a/mm/mmap.c b/mm/mmap.c index 3f287599a7a3..c1b848fa7da6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -93,6 +93,12 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -3023,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) { - struct vm_area_struct *tmp; + if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - /* drop PG_Mlocked flag for over-mapped range */ - for (tmp = vma; tmp->vm_start >= start + size; - tmp = tmp->vm_next) { - /* - * Split pmd and munlock page on the border - * of the range. - */ - vma_adjust_trans_huge(tmp, start, start + size, 0); - - munlock_vma_pages_range(tmp, - max(tmp->vm_start, start), - min(tmp->vm_end, start + size)); - } - } - file = get_file(vma->vm_file); ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); @@ -3403,14 +3393,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma, - unsigned long flags) +static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 0dc7149b0c61..1b9837419bf9 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -249,16 +249,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb_flush_mmu_free(tlb); } -/** - * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down - * @tlb: the mmu_gather structure to initialize - * @mm: the mm_struct of the target address space - * @fullmm: @mm is without users and we're going to destroy the full address - * space (exit/execve) - * - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. - */ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { @@ -283,11 +273,30 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, inc_tlb_flush_pending(tlb->mm); } +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } +/** + * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * In this case, @mm is without users and we're going to destroy the + * full address space (exit/execve). + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 61ee40ed804e..459d195d2ff6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start( ""); WARN_ON(mmu_notifier_range_blockable(range) || _ret != -EAGAIN); + /* + * We call all the notifiers on any EAGAIN, + * there is no way for a notifier to know if + * its start method failed, thus a start that + * does EAGAIN can't also do end. + */ + WARN_ON(ops->invalidate_range_end); ret = _ret; } } } + + if (ret) { + /* + * Must be non-blocking to get here. If there are multiple + * notifiers and one or more failed start, any that succeeded + * start are expecting their end to be called. Do so now. + */ + hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist, srcu_read_lock_held(&srcu)) { + if (!subscription->ops->invalidate_range_end) + continue; + + subscription->ops->invalidate_range_end(subscription, + range); + } + } srcu_read_unlock(&srcu, id); return ret; diff --git a/mm/mremap.c b/mm/mremap.c index ec8f840399ed..d22629ff8f3c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma, flags); + err = vma->vm_ops->mremap(new_vma); } if (unlikely(err)) { @@ -653,8 +653,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } - if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || - vma->vm_flags & VM_SHARED)) + if ((flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return ERR_PTR(-EINVAL); if (is_vm_hugetlb_page(vma)) diff --git a/mm/msync.c b/mm/msync.c index 69c6d2029531..137d1c104f3e 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -55,7 +55,9 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out; /* * If the interval [start,end) covers some unmapped address ranges, - * just ignore them, but return -ENOMEM at the end. + * just ignore them, but return -ENOMEM at the end. Besides, if the + * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM + * anyway and there is nothing left to do, so return immediately. */ mmap_read_lock(mm); vma = find_vma(mm, start); @@ -69,6 +71,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { + if (flags == MS_ASYNC) + goto out_unlock; start = vma->vm_start; if (start >= end) goto out_unlock; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9efaf430cfd3..3df2ac6b8686 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -170,7 +170,7 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/** +/* * Check whether unreclaimable slab amount is greater than * all user memory(LRU pages). * dump_unreclaimable_slab() could help in the case that @@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) if (oom_group) { mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, - (void*)message); + (void *)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb34d204d4ee..5e761fb62800 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; - struct lruvec *lruvec; int ret; - memcg = lock_page_memcg(page); - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_lruvec_state(lruvec, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - __unlock_page_memcg(memcg); + unlock_page_memcg(page); return ret; } @@ -2833,6 +2830,22 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); +/* + * Wait for a page to complete writeback. Returns -EINTR if we get a + * fatal signal while waiting. + */ +int wait_on_page_writeback_killable(struct page *page) +{ + while (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + if (wait_on_page_bit_killable(page, PG_writeback)) + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc72873961d..bcdc0c6f21f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,7 +72,6 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> - #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -108,6 +107,17 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. + * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -167,10 +177,10 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -DEFINE_STATIC_KEY_FALSE(init_on_alloc); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); -DEFINE_STATIC_KEY_FALSE(init_on_free); +DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); static bool _init_on_alloc_enabled_early __read_mostly @@ -384,10 +394,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + if (static_branch_unlikely(&deferred_pages)) + return; + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -438,7 +453,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) +{ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); +} static inline bool early_page_uninitialised(unsigned long pfn) { @@ -764,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, */ void init_mem_debugging_and_hardening(void) { + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + if (_init_on_alloc_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_alloc\n"); else static_branch_enable(&init_on_alloc); } if (_init_on_free_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_free\n"); else static_branch_enable(&init_on_free); } -#ifdef CONFIG_PAGE_POISONING - /* - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. - */ - if (page_poisoning_enabled() || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())) - static_branch_enable(&_page_poisoning_enabled); -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -1103,7 +1129,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page_memcg(page) | + page->memcg_data | #endif (page->flags & check_flags))) return false; @@ -1128,7 +1154,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page_memcg(page))) + if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1216,9 +1242,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool init; VM_BUG_ON_PAGE(PageTail(page), page); @@ -1276,16 +1303,21 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order); /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order); + init = want_init_on_free(); + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + kasan_free_nondeferred_pages(page, order, init, fpi_flags); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -1307,7 +1339,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1327,9 +1359,9 @@ static bool bulkfree_pcp_prepare(struct page *page) static bool free_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, 0, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1537,7 +1569,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1574,7 +1606,7 @@ void __free_pages_core(struct page *page, unsigned int order) * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. */ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -2292,17 +2324,32 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init; + set_page_private(page, 0); set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); - kasan_alloc_pages(page, order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. + */ kernel_unpoison_pages(page, 1 << order); - set_page_owner(page, order, gfp_flags); - if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + */ + init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + kasan_alloc_pages(page, order, init); + if (init && !kasan_has_integrated_init()) kernel_init_free_pages(page, 1 << order); + + set_page_owner(page, order, gfp_flags); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2386,19 +2433,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * boundary. If alignment is required, use move_freepages_block() */ static int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, + unsigned long start_pfn, unsigned long end_pfn, int migratetype, int *num_movable) { struct page *page; + unsigned long pfn; unsigned int order; int pages_moved = 0; - for (page = start_page; page <= end_page;) { - if (!pfn_valid_within(page_to_pfn(page))) { - page++; + for (pfn = start_pfn; pfn <= end_pfn;) { + if (!pfn_valid_within(pfn)) { + pfn++; continue; } + page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2408,8 +2457,7 @@ static int move_freepages(struct zone *zone, if (num_movable && (PageLRU(page) || __PageMovable(page))) (*num_movable)++; - - page++; + pfn++; continue; } @@ -2419,7 +2467,7 @@ static int move_freepages(struct zone *zone, order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); - page += 1 << order; + pfn += 1 << order; pages_moved += 1 << order; } @@ -2429,25 +2477,22 @@ static int move_freepages(struct zone *zone, int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable) { - unsigned long start_pfn, end_pfn; - struct page *start_page, *end_page; + unsigned long start_pfn, end_pfn, pfn; if (num_movable) *num_movable = 0; - start_pfn = page_to_pfn(page); - start_pfn = start_pfn & ~(pageblock_nr_pages-1); - start_page = pfn_to_page(start_pfn); - end_page = start_page + pageblock_nr_pages - 1; + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) - start_page = page; + start_pfn = pfn; if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype, + return move_freepages(zone, start_pfn, end_pfn, migratetype, num_movable); } @@ -2908,7 +2953,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2931,7 +2976,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2940,12 +2985,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA @@ -3415,7 +3460,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3813,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; } -static inline unsigned int current_alloc_flags(gfp_t gfp_mask, - unsigned int alloc_flags) +/* Must be called after current_gfp_context() which can change gfp_mask */ +static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, + unsigned int alloc_flags) { #ifdef CONFIG_CMA - unsigned int pflags = current->flags; - - if (!(pflags & PF_MEMALLOC_NOCMA) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; - #endif return alloc_flags; } @@ -3922,7 +3965,7 @@ retry: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (node_reclaim_mode == 0 || + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; @@ -4158,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + if (*compact_result == COMPACT_SKIPPED) + return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall @@ -4478,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); return alloc_flags; } @@ -4780,7 +4825,7 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4921,7 +4966,7 @@ got_pg: static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, - struct alloc_context *ac, gfp_t *alloc_mask, + struct alloc_context *ac, gfp_t *alloc_gfp, unsigned int *alloc_flags) { ac->highest_zoneidx = gfp_zone(gfp_mask); @@ -4930,7 +4975,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { - *alloc_mask |= __GFP_HARDWALL; + *alloc_gfp |= __GFP_HARDWALL; /* * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. @@ -4949,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4966,15 +5011,160 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. + * + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int nr_populated = 0; + + if (unlikely(nr_pages <= 0)) + return 0; + + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + nr_populated++; + + /* Use the single page allocator for one page. */ + if (nr_pages - nr_populated == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (unlikely(!zone)) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (unlikely(!page)) { + /* Try and get at least one page */ + if (!nr_populated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + local_irq_restore(flags); + + return nr_populated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + return nr_populated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + +/* * This is the 'heart' of the zoned buddy allocator. */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; /* @@ -4982,33 +5172,36 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * so bail out early if the request is out of bound. */ if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } - gfp_mask &= gfp_allowed_mask; - alloc_mask = gfp_mask; - if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) + gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. + */ + gfp = current_gfp_context(gfp); + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, + &alloc_gfp, &alloc_flags)) return NULL; /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); /* First allocation attempt */ - page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page)) goto out; - /* - * Apply scoped allocation constraints. This is mainly about GFP_NOFS - * resp. GFP_NOIO which has to be inherited for all allocation requests - * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. - */ - alloc_mask = current_gfp_context(gfp_mask); + alloc_gfp = gfp; ac.spread_dirty_pages = false; /* @@ -5017,20 +5210,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, */ ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); + page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; } - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; } -EXPORT_SYMBOL(__alloc_pages_nodemask); +EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned @@ -7689,7 +7882,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -void __init mem_init_print_info(const char *str) +void __init mem_init_print_info(void) { unsigned long physpages, codesize, datasize, rosize, bss_size; unsigned long init_code_size, init_data_size; @@ -7728,17 +7921,17 @@ void __init mem_init_print_info(const char *str) #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif - "%s%s)\n", + ")\n", nr_free_pages() << (PAGE_SHIFT - 10), physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10) #ifdef CONFIG_HIGHMEM - totalhigh_pages() << (PAGE_SHIFT - 10), + , totalhigh_pages() << (PAGE_SHIFT - 10) #endif - str ? ", " : "", str ? str : ""); + ); } /** @@ -8222,6 +8415,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8289,6 +8483,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8305,7 +8500,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; @@ -8450,6 +8645,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +/* Usage: See admin-guide/dynamic-debug-howto.rst */ +static void alloc_contig_dump_pages(struct list_head *page_list) +{ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); + + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { + struct page *page; + + dump_stack(); + list_for_each_entry(page, page_list, lru) + dump_page(page, "migration failure"); + } +} +#else +static inline void alloc_contig_dump_pages(struct list_head *page_list) +{ +} +#endif + /* [start, end) must belong to a single zone. */ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -8464,7 +8680,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - migrate_prep(); + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8474,14 +8690,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc, pfn, end); - if (!pfn) { - ret = -EINTR; + ret = isolate_migratepages_range(cc, pfn, end); + if (ret && ret != -EAGAIN) break; - } + pfn = cc->migrate_pfn; tries = 0; } else if (++tries == 5) { - ret = ret < 0 ? ret : -EBUSY; + ret = -EBUSY; break; } @@ -8491,8 +8706,18 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + + /* + * On -ENOMEM, migrate_pages() bails out right away. It is pointless + * to retry again over this error, so do the same here. + */ + if (ret == -ENOMEM) + break; } + + lru_cache_enable(); if (ret < 0) { + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } @@ -8583,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; - ret =0; + ret = 0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES @@ -8602,8 +8827,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, * isolated thus they won't get removed from buddy. */ - lru_add_drain_all(); - order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { @@ -8629,8 +8852,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, 0)) { - pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", - __func__, outer_start, end); ret = -EBUSY; goto done; } @@ -8680,12 +8901,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; } return true; } @@ -8757,9 +8972,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8767,7 +8982,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); @@ -8805,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { - unsigned long flags; int cpu; struct per_cpu_pageset *pset; - /* avoid races with drain_pages() */ - local_irq_save(flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -8819,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/page_counter.c b/mm/page_counter.c index c6860f51b6c6..7d83641eb86b 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -52,9 +52,13 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_protected_usage(counter, new); /* More uncharges than charges? */ - WARN_ON_ONCE(new < 0); + if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", + new, nr_pages)) { + new = 0; + atomic_long_set(&counter->usage, new); + } + propagate_protected_usage(counter, new); } /** diff --git a/mm/page_owner.c b/mm/page_owner.c index d15c7c4994f5..9661d5320a07 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -27,6 +27,7 @@ struct page_owner { depot_stack_handle_t handle; depot_stack_handle_t free_handle; u64 ts_nsec; + u64 free_ts_nsec; pid_t pid; }; @@ -41,13 +42,7 @@ static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { - if (!buf) - return -EINVAL; - - if (strcmp(buf, "on") == 0) - page_owner_enabled = true; - - return 0; + return kstrtobool(buf, &page_owner_enabled); } early_param("page_owner", early_page_owner_param); @@ -103,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -static inline bool check_recursive_alloc(unsigned long *entries, - unsigned int nr_entries, - unsigned long ip) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (entries[i] == ip) - return true; - } - return false; -} - static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; - nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); - /* - * We need to check recursion here because our request to - * stackdepot could trigger memory allocation to save new - * entry. New memory allocation would reach here and call - * stack_depot_save_entries() again if we don't catch it. There is - * still not enough memory in stackdepot so it would try to - * allocate memory again and loop forever. + * Avoid recursion. + * + * Sometimes page metadata allocation tracking requires more + * memory to be allocated: + * - when new stack trace is saved to stack depot + * - when backtrace itself is calculated (ia64) */ - if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) + if (current->in_page_owner) return dummy_handle; + current->in_page_owner = 1; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + current->in_page_owner = 0; return handle; } @@ -146,25 +129,27 @@ void __reset_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext; - depot_stack_handle_t handle = 0; + depot_stack_handle_t handle; struct page_owner *page_owner; - - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + u64 free_ts_nsec = local_clock(); page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; + + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } } -static inline void __set_page_owner_handle(struct page *page, - struct page_ext *page_ext, depot_stack_handle_t handle, - unsigned int order, gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) { struct page_owner *page_owner; int i; @@ -194,7 +179,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -243,6 +228,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -356,10 +342,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec); + page_owner->ts_nsec, page_owner->free_ts_nsec); if (ret >= count) goto err; @@ -435,9 +421,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec); + page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { @@ -612,7 +598,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page, page_ext, early_handle, + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } diff --git a/mm/page_poison.c b/mm/page_poison.c index 65cdf844c8ad..98438985e1ed 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/mmdebug.h> #include <linux/highmem.h> #include <linux/page_ext.h> #include <linux/poison.h> @@ -45,7 +46,7 @@ static bool single_bit_flip(unsigned char a, unsigned char b) return error && !(error & (error - 1)); } -static void check_poison_mem(unsigned char *mem, size_t bytes) +static void check_poison_mem(struct page *page, unsigned char *mem, size_t bytes) { static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; @@ -70,6 +71,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); dump_stack(); + dump_page(page, "pagealloc: corrupted page details"); } static void unpoison_page(struct page *page) @@ -77,12 +79,14 @@ static void unpoison_page(struct page *page) void *addr; addr = kmap_atomic(page); + kasan_disable_current(); /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned. */ - check_poison_mem(addr, PAGE_SIZE); + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 18b768ac7dca..095d7eaa0db4 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -87,7 +87,7 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; -extern int pcpu_nr_empty_pop_pages; +extern int pcpu_nr_empty_pop_pages[]; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index c8400a2adbc2..f6026dbcdf6b 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -145,6 +145,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) int slot, max_nr_alloc; int *buffer; enum pcpu_chunk_type type; + int nr_empty_pop_pages; alloc_buffer: spin_lock_irq(&pcpu_lock); @@ -165,7 +166,11 @@ alloc_buffer: goto alloc_buffer; } -#define PL(X) \ + nr_empty_pop_pages = 0; + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type]; + +#define PL(X) \ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, @@ -196,7 +201,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); - P("empty_pop_pages", pcpu_nr_empty_pop_pages); + P("empty_pop_pages", nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index e46f7a6917f9..8d3844bc0c7c 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -8,6 +8,7 @@ * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ +#include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) @@ -133,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { - unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); + vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** @@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { - return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, - PAGE_KERNEL, pages); + return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), + PAGE_KERNEL, pages, PAGE_SHIFT); } /** diff --git a/mm/percpu.c b/mm/percpu.c index 6596a0a4286e..23308113a5ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -173,10 +173,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ static LIST_HEAD(pcpu_map_extend_chunks); /* - * The number of empty populated pages, protected by pcpu_lock. The - * reserved chunk doesn't contribute to the count. + * The number of empty populated pages by chunk type, protected by pcpu_lock. + * The reserved chunk doesn't contribute to the count. */ -int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES]; /* * The number of populated pages in use by the allocator, protected by @@ -556,7 +556,7 @@ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr; } /* @@ -1832,7 +1832,7 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ @@ -2000,7 +2000,7 @@ retry_pop: pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - - pcpu_nr_empty_pop_pages, + pcpu_nr_empty_pop_pages[type], 0, PCPU_EMPTY_POP_PAGES_HIGH); } @@ -2580,7 +2580,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = chunk; - pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; + pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index f5fee9cf90f8..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> -#include <linux/compat.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> diff --git a/mm/ptdump.c b/mm/ptdump.c index 4354c1422d57..da751448d0e4 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = READ_ONCE(*pte); + pte_t val = ptep_get(pte); if (st->effective_prot) st->effective_prot(st, 4, pte_val(val)); diff --git a/mm/readahead.c b/mm/readahead.c index c5b0457415be..d589f147f4c2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -198,8 +198,6 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i); - BUG_ON(index + i != ractl->_index + ractl->_nr_pages); - if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch @@ -210,6 +208,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * not worth getting one just for that. */ read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } @@ -223,6 +222,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, gfp_mask) < 0) { put_page(page); read_pages(ractl, &page_pool, true); + i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) @@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl, * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long nr_to_read) + unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; @@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping, * A minimal readahead algorithm for trivial sequential/random reads. */ static void ondemand_readahead(struct readahead_control *ractl, - struct file_ra_state *ra, bool hit_readahead_marker, - unsigned long req_size) + bool hit_readahead_marker, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); + struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; unsigned long index = readahead_index(ractl); @@ -550,7 +551,7 @@ readit: } void page_cache_sync_ra(struct readahead_control *ractl, - struct file_ra_state *ra, unsigned long req_count) + unsigned long req_count) { bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); @@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, * read-ahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ - if (!ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; @@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl, /* be dumb */ if (do_forced_ra) { - force_page_cache_ra(ractl, ra, req_count); + force_page_cache_ra(ractl, req_count); return; } /* do read-ahead */ - ondemand_readahead(ractl, ra, false, req_count); + ondemand_readahead(ractl, false, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, - struct file_ra_state *ra, struct page *page, - unsigned long req_count) + struct page *page, unsigned long req_count) { /* no read-ahead */ - if (!ra->ra_pages) + if (!ractl->ra->ra_pages) return; /* @@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl, return; /* do read-ahead */ - ondemand_readahead(ractl, ra, true, req_count); + ondemand_readahead(ractl, true, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_ra); @@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } + +/** + * readahead_expand - Expand a readahead request + * @ractl: The request to be expanded + * @new_start: The revised start + * @new_len: The revised size of the request + * + * Attempt to expand a readahead request outwards from the current size to the + * specified size by inserting locked pages before and after the current window + * to increase the size to the new window. This may involve the insertion of + * THPs, in which case the window may get expanded even beyond what was + * requested. + * + * The algorithm will stop if it encounters a conflicting page already in the + * pagecache and leave a smaller expansion than requested. + * + * The caller must check for this by examining the revised @ractl object for a + * different expansion than was requested. + */ +void readahead_expand(struct readahead_control *ractl, + loff_t new_start, size_t new_len) +{ + struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; + pgoff_t new_index, new_nr_pages; + gfp_t gfp_mask = readahead_gfp_mask(mapping); + + new_index = new_start / PAGE_SIZE; + + /* Expand the leading edge downwards */ + while (ractl->_index > new_index) { + unsigned long index = ractl->_index - 1; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + + ractl->_nr_pages++; + ractl->_index = page->index; + } + + new_len += new_start - readahead_pos(ractl); + new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); + + /* Expand the trailing edge upwards */ + while (ractl->_nr_pages < new_nr_pages) { + unsigned long index = ractl->_index + ractl->_nr_pages; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + ractl->_nr_pages++; + if (ra) { + ra->size++; + ra->async_size++; + } + } +} +EXPORT_SYMBOL(readahead_expand); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..a08cedefbfaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2846,6 +2846,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; } @@ -3505,7 +3508,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data) } } if (*this_char) { - char *value = strchr(this_char,'='); + char *value = strchr(this_char, '='); size_t len = 0; int err; diff --git a/mm/shuffle.c b/mm/shuffle.c index 9c2e145a747a..c13c33b247e8 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -147,8 +147,8 @@ void __meminit __shuffle_zone(struct zone *z) spin_unlock_irqrestore(&z->lock, flags); } -/** - * shuffle_free_memory - reduce the predictability of the page allocator +/* + * __shuffle_free_memory - reduce the predictability of the page allocator * @pgdat: node page data */ void __meminit __shuffle_free_memory(pg_data_t *pgdat) diff --git a/mm/slab.c b/mm/slab.c index ae651bf540b7..df45c437b394 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ void *ptr; int slab_node = numa_mem_id(); struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) - memset(ptr, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); + slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); return ptr; } @@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo unsigned long save_flags; void *objp; struct obj_cgroup *objcg = NULL; + bool init = false; flags &= gfp_allowed_mask; cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags); @@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); - - if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) - memset(objp, 0, cachep->object_size); + init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); return objp; } @@ -3427,17 +3425,24 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + bool init; + if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); __kfence_free(objp); return; } - if (unlikely(slab_want_init_on_free(cachep))) + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset must be + * kept together to avoid discrepancies in behavior. + */ + init = slab_want_init_on_free(cachep); + if (init && !kasan_has_integrated_init()) memset(objp, 0, cachep->object_size); - - /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp)) + /* KASAN might put objp into memory quarantine, delaying its reuse. */ + if (kasan_slab_free(cachep, objp, init)) return; /* Use KCSAN to help debug racy use-after-free. */ @@ -3542,18 +3547,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); - /* Clear memory outside IRQ disabled section */ - if (unlikely(slab_want_init_on_alloc(flags, s))) - for (i = 0; i < size; i++) - memset(p[i], 0, s->object_size); - - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled section. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3651,6 +3656,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { struct kmem_cache *cachep; @@ -3670,6 +3676,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) if (DEBUG && cachep->flags & SLAB_STORE_USER) kpp->kp_ret = *dbg_userword(cachep, objp); } +#endif /** * __do_kmalloc - allocate memory diff --git a/mm/slab.h b/mm/slab.h index 076582f58f68..18c1927cd196 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, } static inline void slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, void **p) + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init) { size_t i; flags &= gfp_allowed_mask; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ for (i = 0; i < size; i++) { - p[i] = kasan_slab_alloc(s, p[i], flags); - /* As p[i] might get tagged, call kmemleak hook after KASAN. */ + p[i] = kasan_slab_alloc(s, p[i], flags, init); + if (p[i] && init && !kasan_has_integrated_init()) + memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); } @@ -601,7 +610,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_alloc)) { + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) @@ -613,12 +623,14 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) static inline bool slab_want_init_on_free(struct kmem_cache *c) { - if (static_branch_unlikely(&init_on_free)) + if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; } +#ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; @@ -630,5 +642,6 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; }; void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +#endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 88e833986332..f8833d3e5d47 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -71,11 +71,19 @@ static int __init setup_slab_nomerge(char *str) return 1; } +static int __init setup_slab_merge(char *str) +{ + slab_nomerge = false; + return 1; +} + #ifdef CONFIG_SLUB __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +__setup_param("slub_merge", slub_merge, setup_slab_merge, 0); #endif __setup("slab_nomerge", setup_slab_nomerge); +__setup("slab_merge", setup_slab_merge); /* * Determine the size of a slab object @@ -526,6 +534,7 @@ bool slab_is_available(void) return slab_state >= UP; } +#ifdef CONFIG_PRINTK /** * kmem_valid_obj - does the pointer reference a valid slab object? * @object: pointer to query. @@ -544,6 +553,7 @@ bool kmem_valid_obj(void *object) page = virt_to_head_page(object); return PageSlab(page); } +EXPORT_SYMBOL_GPL(kmem_valid_obj); /** * kmem_dump_obj - Print available slab provenance information @@ -600,6 +610,8 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_stack[i]); } } +EXPORT_SYMBOL_GPL(kmem_dump_obj); +#endif #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ diff --git a/mm/slob.c b/mm/slob.c index 0578429b991b..74d3f6e60666 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,11 +461,13 @@ out: spin_unlock_irqrestore(&slob_lock, flags); } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { kpp->kp_ptr = object; kpp->kp_page = page; } +#endif /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. diff --git a/mm/slub.c b/mm/slub.c index 3021ce9bf1b3..68123b21e65f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3,7 +3,7 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks or atomic operatios + * The allocator synchronizes using per slab locks or atomic operations * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter @@ -160,7 +160,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #undef SLUB_DEBUG_CMPXCHG /* - * Mininum number of partial slabs. These will be left on the partial + * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ #define MIN_PARTIAL 5 @@ -624,7 +624,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) if (!t->addr) return; - pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { @@ -650,8 +650,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n", + page, page->objects, page->inuse, page->freelist, + page->flags, &page->flags); } @@ -706,7 +707,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) @@ -799,7 +800,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); @@ -832,7 +833,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at mininum + * C. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -1532,7 +1533,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x); } -static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, + void *x, bool init) { kmemleak_free_recursive(x, s->flags); @@ -1558,8 +1560,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); - /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x); + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. + * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. + */ + if (init) { + int rsize; + + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + s->inuse, 0, + s->size - s->inuse - rsize); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -1569,10 +1588,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *object; void *next = *head; void *old_tail = *tail ? *tail : *head; - int rsize; if (is_kfence_address(next)) { - slab_free_hook(s, next); + slab_free_hook(s, next, false); return true; } @@ -1584,20 +1602,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, object = next; next = get_freepointer(s, object); - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch - * the redzone. - */ - memset(kasan_reset_tag(object), 0, s->object_size); - rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad - : 0; - memset((char *)kasan_reset_tag(object) + s->inuse, 0, - s->size - s->inuse - rsize); - - } /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { + if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2822,6 +2828,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct page *page; unsigned long tid; struct obj_cgroup *objcg = NULL; + bool init = false; s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) @@ -2899,12 +2906,10 @@ redo: } maybe_wipe_obj_freeptr(s, object); - - if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) - memset(kasan_reset_tag(object), 0, s->object_size); + init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); return object; } @@ -3236,7 +3241,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, } if (is_kfence_address(object)) { - slab_free_hook(df->s, object); + slab_free_hook(df->s, object, false); __kfence_free(object); p[size] = NULL; /* mark object processed */ return size; @@ -3356,20 +3361,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c->tid = next_tid(c->tid); local_irq_enable(); - /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(slab_want_init_on_alloc(flags, s))) { - int j; - - for (j = 0; j < i; j++) - memset(kasan_reset_tag(p[j]), 0, s->object_size); - } - - /* memcg and kmem_cache debug support */ - slab_post_alloc_hook(s, objcg, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); return i; error: local_irq_enable(); - slab_post_alloc_hook(s, objcg, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3421,7 +3422,7 @@ static unsigned int slub_min_objects; * * Higher order allocations also allow the placement of more objects in a * slab and thereby reduce object handling overhead. If the user has - * requested a higher mininum order then we start with that one instead of + * requested a higher minimum order then we start with that one instead of * the smallest order which will fit the object. */ static inline unsigned int slab_order(unsigned int size, @@ -3579,7 +3580,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; @@ -3827,6 +3828,15 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); +#endif s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); @@ -3898,7 +3908,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(__obj_to_index(s, addr, p), map)) { - pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3963,6 +3973,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { void *base; @@ -4002,6 +4013,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) #endif #endif } +#endif /******************************************************************** * Kmalloc subsystem diff --git a/mm/sparse.c b/mm/sparse.c index 7bd23f9d6cef..b2ada9dc00cb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (unlikely(!mem_section)) { unsigned long size, align; - size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); if (!mem_section) @@ -547,6 +547,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); pnum_begin = pnum; + sparse_buffer_fini(); goto failed; } check_usemap_section_nr(nid, usage); @@ -623,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -644,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, diff --git a/mm/swap.c b/mm/swap.c index 31b844d4ed94..a75a8265302b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -36,6 +36,7 @@ #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> +#include <linux/buffer_head.h> #include "internal.h" @@ -235,6 +236,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) } } +/* return true if pagevec needs to drain */ +static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) +{ + bool ret = false; + + if (!pagevec_add(pvec, page) || PageCompound(page) || + lru_cache_disabled()) + ret = true; + + return ret; +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the @@ -252,7 +265,7 @@ void rotate_reclaimable_page(struct page *page) get_page(page); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -343,7 +356,7 @@ static void activate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -458,7 +471,7 @@ void lru_cache_add(struct page *page) get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } @@ -629,6 +642,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + invalidate_bh_lrus_cpu(cpu); } /** @@ -654,7 +668,7 @@ void deactivate_file_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } @@ -676,7 +690,7 @@ void deactivate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } @@ -698,7 +712,7 @@ void mark_page_lazyfree(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } @@ -735,7 +749,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ -void lru_add_drain_all(void) +inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number @@ -780,7 +794,7 @@ void lru_add_drain_all(void) * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ - if (unlikely(this_gen != lru_drain_gen)) + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* @@ -810,12 +824,14 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (force_all_cpus || + pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu)) { + need_activate_page_drain(cpu) || + has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); @@ -828,6 +844,11 @@ void lru_add_drain_all(void) done: mutex_unlock(&lock); } + +void lru_add_drain_all(void) +{ + __lru_add_drain_all(false); +} #else void lru_add_drain_all(void) { @@ -835,6 +856,34 @@ void lru_add_drain_all(void) } #endif /* CONFIG_SMP */ +atomic_t lru_disable_count = ATOMIC_INIT(0); + +/* + * lru_cache_disable() needs to be called before we start compiling + * a list of pages to be migrated using isolate_lru_page(). + * It drains pages on LRU cache and then disable on all cpus until + * lru_cache_enable is called. + * + * Must be paired with a call to lru_cache_enable(). + */ +void lru_cache_disable(void) +{ + atomic_inc(&lru_disable_count); +#ifdef CONFIG_SMP + /* + * lru_add_drain_all in the force mode will schedule draining on + * all online CPUs so any calls of lru_cache_disabled wrapped by + * local_lock or preemption disabled would be ordered by that. + * The atomic operation doesn't need to have stronger ordering + * requirements because that is enforeced by the scheduling + * guarantees. + */ + __lru_add_drain_all(true); +#else + lru_add_drain(); +#endif +} + /** * release_pages - batched put_page() * @pages: array of pages to release diff --git a/mm/swap_state.c b/mm/swap_state.c index 3cdee7b11da9..3a1259c13f3b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, xas_store(&xas, page); xas_next(&xas); } - address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); @@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page, xas_next(&xas); } ClearPageSwapCache(page); - if (shadow) - address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); @@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xas_store(&xas, NULL); nr_shadows++; } - address_space->nrexceptional -= nr_shadows; xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ @@ -497,16 +493,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageLocked(page); __SetPageSwapBacked(page); - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { - put_swap_page(page, entry); + if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) goto fail_unlock; - } - if (mem_cgroup_charge(page, NULL, gfp_mask)) { - delete_from_swap_cache(page); + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - } + + mem_cgroup_swapin_uncharge_swap(entry); if (shadow) workingset_refault(page, shadow); @@ -517,6 +511,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return page; fail_unlock: + put_swap_page(page, entry); unlock_page(page); put_page(page); return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index 084a5b9a18e5..149e77454e3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v) unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } @@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in diff --git a/mm/truncate.c b/mm/truncate.c index 455944264663..95af244b112a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping, if (xas_load(&xas) != entry) return; xas_store(&xas, NULL); - mapping->nrexceptional--; } static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, @@ -295,7 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; /* Offsets within partial pages */ @@ -440,9 +439,6 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrexceptional; - unsigned long nrpages; - /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the @@ -452,16 +448,7 @@ void truncate_inode_pages_final(struct address_space *mapping) */ mapping_set_exiting(mapping); - /* - * When reclaim installs eviction entries, it increases - * nrexceptional first, then decreases nrpages. Make sure we see - * this in the right order or we might miss an entry. - */ - nrpages = mapping->nrpages; - smp_rmb(); - nrexceptional = mapping->nrexceptional; - - if (nrpages || nrexceptional) { + if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree @@ -633,7 +620,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; pagevec_init(&pvec); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9a3d451402d7..e14b3820c6a8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + enum mcopy_atomic_mode mode) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (zeropage) { + if (mode == MCOPY_ATOMIC_ZEROPAGE) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -273,8 +273,6 @@ retry: } while (src_addr < src_start + len) { - pte_t dst_pteval; - BUG_ON(dst_addr >= dst_start + len); /* @@ -290,23 +288,23 @@ retry: mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } - err = -EEXIST; - dst_pteval = huge_ptep_get(dst_pte); - if (!huge_pte_none(dst_pteval)) { + if (mode != MCOPY_ATOMIC_CONTINUE && + !huge_pte_none(huge_ptep_get(dst_pte))) { + err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, - dst_addr, src_addr, &page); + dst_addr, src_addr, mode, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage); + enum mcopy_atomic_mode mode); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage, + enum mcopy_atomic_mode mcopy_mode, bool *mmap_changing, __u64 mode) { @@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; + bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -527,10 +526,12 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, zeropage); + src_start, len, mcopy_mode); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; + if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page @@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing, mode); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, + mmap_changing, 0); +} + +ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool *mmap_changing) +{ + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, + mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, diff --git a/mm/util.c b/mm/util.c index 54870226cea6..a8bf17f18a81 100644 --- a/mm/util.c +++ b/mm/util.c @@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); -/* - * For file cache pages, return the address_space, otherwise return NULL - */ -struct address_space *page_mapping_file(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return NULL; - return page_mapping(page); -} - /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { @@ -775,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply - * with the strict "NEVER", and to avoid possible race condtion (even + * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch @@ -983,6 +973,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) return ret; } +#ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. @@ -996,20 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) */ void mem_dump_obj(void *object) { + const char *type; + if (kmem_valid_obj(object)) { kmem_dump_obj(object); return; } + if (vmalloc_dump_obj(object)) return; - if (!virt_addr_valid(object)) { - if (object == NULL) - pr_cont(" NULL pointer.\n"); - else if (object == ZERO_SIZE_PTR) - pr_cont(" zero-size pointer.\n"); - else - pr_cont(" non-paged memory.\n"); - return; - } - pr_cont(" non-slab/vmalloc memory.\n"); + + if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); } +EXPORT_SYMBOL_GPL(mem_dump_obj); +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4f5f8c907897..9c539f0730a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,7 +34,7 @@ #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> - +#include <linux/pgtable.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -68,6 +81,218 @@ static void free_work(struct work_struct *w) } /*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) +{ + pte_t *pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = pte_alloc_kernel_track(pmd, addr, mask); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; + return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PMD_MODIFIED; + continue; + } + + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + return -ENOMEM; + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PUD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc_track(&init_mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PUD_MODIFIED; + continue; + } + + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(addr, P4D_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_P4D_MODIFIED; + continue; + } + + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_range_noflush(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + flush_cache_vmap(addr, end); + + return err; +} static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) @@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } while (p4d++, addr = next, addr != end); } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap +/* + * vunmap_range_noflush is similar to vunmap_range, but does not + * flush caches or TLBs. * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. + * The caller is responsible for calling flush_cache_vmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced). * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * This is an internal function only. Do not use outside mm/. */ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +void vunmap_range_noflush(unsigned long start, unsigned long end) { - unsigned long end = start + size; unsigned long next; pgd_t *pgd; unsigned long addr = start; @@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +/** + * vunmap_range - unmap kernel virtual addresses + * @addr: start of the VM area to unmap + * @end: end of the VM area to unmap (non-inclusive) + * + * Clears any present PTEs in the virtual address range, flushes TLBs and + * caches. Any subsequent access to the address before it has been re-mapped + * is a kernel bug. + */ +void vunmap_range(unsigned long addr, unsigned long end) +{ + flush_cache_vunmap(addr, end); + vunmap_range_noflush(addr, end); + flush_tlb_kernel_range(addr, end); +} + +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages) +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. + */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) { - int ret; + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || + page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; - ret = map_kernel_range_noflush(start, size, prot, pages); - flush_cache_vmap(start, start + size); - return ret; + addr += 1UL << page_shift; + } + + return 0; +} + +/** + * vmap_pages_range - map pages to a kernel virtual address + * @addr: start of the VM area to map + * @end: end of the VM area to map (non-inclusive) + * @prot: page protection flags to use + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; } int is_vmalloc_or_module_addr(const void *x) @@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { @@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pgd_none(*pgd)) return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; - /* - * Don't dereference bad PUD or PMD (below) entries. This will also - * identify huge mappings, which we may encounter on architectures - * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be - * identified as vmalloc addresses by is_vmalloc_addr(), but are - * not [unambiguously] associated with a struct page, so there is - * no correct value to return for them. - */ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); @@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); + return page; } EXPORT_SYMBOL(vmalloc_to_page); @@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va) spin_unlock(&free_vmap_area_lock); } +static inline void +preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) +{ + struct vmap_area *va = NULL; + + /* + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. It guarantees that + * a CPU that does an allocation is preloaded. + * + * We do it in non-atomic context, thus it allows us to use more + * permissive allocation masks to be more stable under low memory + * condition and high memory pressure. + */ + if (!this_cpu_read(ne_fit_preload_node)) + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(lock); + + if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) + kmem_cache_free(vmap_area_cachep, va); +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va, *pva; + struct vmap_area *va; unsigned long addr; int purged = 0; int ret; @@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - /* - * Preload this CPU with one extra vmap_area object. It is used - * when fit type of free area is NE_FIT_TYPE. Please note, it - * does not guarantee that an allocation occurs on a CPU that - * is preloaded, instead we minimize the case when it is not. - * It can happen because of cpu migration, because there is a - * race until the below spinlock is taken. - * - * The preload is done in non-atomic context, thus it allows us - * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. In rare case, - * if not preloaded, GFP_NOWAIT is used. - * - * Set "pva" to NULL here, because of "retry" path. - */ - pva = NULL; - - if (!this_cpu_read(ne_fit_preload_node)) - /* - * Even if it fails we do not really care about that. - * Just proceed as it is. If needed "overflow" path - * will refill the cache we allocate from. - */ - pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - - spin_lock(&free_vmap_area_lock); - - if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) - kmem_cache_free(vmap_area_cachep, pva); + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); - if (unlikely(addr == vend)) goto overflow; @@ -1231,7 +1503,6 @@ retry: va->va_end = addr + size; va->vm = NULL; - spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); + vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size) offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); - unmap_kernel_range_noflush(addr, size); + vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); @@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { spin_lock(&vb->lock); - if (vb->dirty) { + if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; @@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { + if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, + pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return vm->page_order; +#else + return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + vm->page_order = order; +#else + BUG_ON(order != 0); +#endif +} + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -2023,23 +2314,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ -void unmap_kernel_range(unsigned long addr, unsigned long size) -{ - unsigned long end = addr + size; - - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, size); - flush_tlb_kernel_range(addr, end); -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2199,6 +2473,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2208,6 +2483,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; @@ -2232,11 +2508,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2277,13 +2556,14 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { + unsigned int page_order = vm_area_page_order(area); int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2402,6 +2682,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; + unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); @@ -2414,8 +2695,9 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), - pages) < 0) { + addr = (unsigned long)area->addr; + if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), + pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } @@ -2474,15 +2756,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); unsigned long array_size; - unsigned int i; + unsigned int nr_small_pages = size >> PAGE_SHIFT; + unsigned int page_order; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2497,42 +2783,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!pages) { free_vm_area(area); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page array size %lu allocation failed", + nr_small_pages * PAGE_SIZE, array_size); return NULL; } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - for (i = 0; i < area->nr_pages; i++) { - struct page *page; + page_order = vm_area_page_order(area); - if (node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { + struct page *page; + int p; + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "failed to map pages", + area->nr_pages * PAGE_SIZE); goto fail; + } return area->addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); __vfree(area->addr); return NULL; } @@ -2563,19 +2867,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages()) - goto fail; + if (WARN_ON_ONCE(!size)) + return NULL; + + if ((size >> PAGE_SHIFT) > totalram_pages()) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "exceeds total pages", real_size); + return NULL; + } + + if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && + arch_vmap_pmd_supported(prot)) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); - if (!area) + if (!area) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "vm_struct allocation failed", real_size); goto fail; + } - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2589,8 +2928,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure: %lu bytes", real_size); + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + return NULL; } @@ -2739,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node); * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** @@ -2894,7 +3238,10 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { + va = __find_vmap_area((unsigned long)addr); + if (!va) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; @@ -3072,7 +3419,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, return 0; } -EXPORT_SYMBOL(remap_vmalloc_range_partial); /** * remap_vmalloc_range - map vmalloc pages to userspace @@ -3450,6 +3796,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { struct vm_struct *vm; @@ -3462,6 +3809,7 @@ bool vmalloc_dump_obj(void *object) vm->nr_pages, (unsigned long)vm->addr, vm->caller); return true; } +#endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index 562e87cbd7a1..5199b9696bab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG -/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) +static int shrinker_nr_max; + +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} + +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int map_size, int defer_size, + int old_map_size, int old_defer_size) +{ + struct shrinker_info *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + int size = map_size + defer_size; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = shrinker_info_protected(memcg, nid); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); + + rcu_assign_pointer(pn->shrinker_info, new); + kvfree_rcu(old, rcu); + } + + return 0; +} + +void free_shrinker_info(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct shrinker_info *info; + int nid; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); + } +} + +int alloc_shrinker_info(struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + int nid, size, ret = 0; + int map_size, defer_size = 0; + + down_write(&shrinker_rwsem); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; + for_each_node(nid) { + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); + ret = -ENOMEM; + break; + } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); + } + up_write(&shrinker_rwsem); + + return ret; +} + +static inline bool need_expand(int nr_max) +{ + return round_up(nr_max, BITS_PER_LONG) > + round_up(shrinker_nr_max, BITS_PER_LONG); +} + +static int expand_shrinker_info(int new_id) +{ + int ret = 0; + int new_nr_max = new_id + 1; + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; + struct mem_cgroup *memcg; + + if (!need_expand(new_nr_max)) + goto out; + + if (!root_mem_cgroup) + goto out; + + lockdep_assert_held(&shrinker_rwsem); + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto out; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +out: + if (!ret) + shrinker_nr_max = new_nr_max; + + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct shrinker_info *info; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, info->map); + rcu_read_unlock(); + } +} static DEFINE_IDR(shrinker_idr); -static int shrinker_nr_max; static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM; + if (mem_cgroup_disabled()) + return -ENOSYS; + down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { - if (memcg_expand_shrinker_maps(id)) { + if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } - - shrinker_nr_max = id + 1; } shrinker->id = id; ret = 0; @@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < shrinker_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc) #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { - return 0; + return -ENOSYS; } static void unregister_memcg_shrinker(struct shrinker *shrinker) { } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc) } #endif +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, */ int prealloc_shrinker(struct shrinker *shrinker) { - unsigned int size = sizeof(*shrinker->nr_deferred); + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - if (prealloc_memcg_shrinker(shrinker)) - goto free_deferred; - } - return 0; - -free_deferred: - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) - return; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + return; + } kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); } @@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long freeable; long nr; long new_nr; - int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + nr = xchg_nr_deferred(shrinker, shrinkctl); - total_scan = nr; if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; @@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } + total_scan = nr >> priority; total_scan += delta; - if (total_scan < 0) { - pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", - shrinker->scan_objects, total_scan); - total_scan = freeable; - next_deferred = nr; - } else - next_deferred = total_scan; - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * freeable. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < freeable / 4) - total_scan = min(total_scan, freeable / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > freeable * 2) - total_scan = freeable * 2; + total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); @@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, cond_resched(); } - if (next_deferred >= scanned) - next_deferred -= scanned; - else - next_deferred = 0; + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + /* * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. + * manner that handles concurrent updates. */ - if (next_deferred > 0) - new_nr = atomic_long_add_return(next_deferred, - &shrinker->nr_deferred[nid]); - else - new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } @@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; unsigned long ret, freed = 0; int i; @@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, - true); - if (unlikely(!map)) + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) goto unlock; - for_each_set_bit(i, map->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) - clear_bit(i, map->map); + clear_bit(i, info->map); continue; } @@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { - clear_bit(i, map->map); + clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in @@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in - * memcg_set_shrinker_bit(): + * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() @@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; else - memcg_set_shrinker_bit(memcg, nid, i); + set_shrinker_bit(memcg, nid, i); } freed += ret; @@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_lru(page) && !PageDirty(page) && - !__PageMovable(page) && !PageUnevictable(page)) { + if (!PageHuge(page) && page_is_file_lru(page) && + !PageDirty(page) && !__PageMovable(page) && + !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -3862,7 +4059,7 @@ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -4086,14 +4283,6 @@ module_init(kswapd_init) int node_reclaim_mode __read_mostly; /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. - */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ - -/* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. diff --git a/mm/vmstat.c b/mm/vmstat.c index 74b2c374b86c..5ba118521ded 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1313,6 +1313,10 @@ const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", +#endif "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -1365,6 +1369,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_X86 + "direct_map_level2_splits", + "direct_map_level3_splits", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1854,25 +1862,34 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; + } val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", __func__, zone_stat_name(i), val); - err = -EINVAL; } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_numa_stat[i]); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_WRITEBACK: + continue; + } + val = atomic_long_read(&vm_node_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", - __func__, numa_stat_name(i), val); - err = -EINVAL; + __func__, node_stat_name(i), val); } } -#endif - if (err) - return err; if (write) *ppos += *lenp; else diff --git a/mm/workingset.c b/mm/workingset.c index cd39902c1062..b7cdeca5a76d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); diff --git a/mm/z3fold.c b/mm/z3fold.c index b5dafa7e44e4..9d889ad2bb86 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1346,8 +1346,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) page = list_entry(pos, struct page, lru); zhdr = page_address(page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* + * For non-headless pages, we wait to do this + * until we have the page lock to avoid racing + * with __z3fold_alloc(). Headless pages don't + * have a lock (and __z3fold_alloc() will never + * see them), but we still need to test and set + * PAGE_CLAIMED to avoid racing with + * z3fold_free(), so just do it now before + * leaving the loop. + */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; + break; + } if (kref_get_unless_zero(&zhdr->refcount) == 0) { zhdr = NULL; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 30c358b72025..58697f7a43f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); @@ -2035,8 +2034,7 @@ unpin_objects: head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } } diff --git a/mm/zswap.c b/mm/zswap.c index 578d9f256920..20763267a219 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); - strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { |