From 42c269c88dc146982a54a8267f71abc99f12852a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 16:15:39 -0500 Subject: ftrace: Allow for function tracing to record init functions on boot up Adding a hook into free_reserve_area() that informs ftrace that boot up init text is being free, lets ftrace safely remove those init functions from its records, which keeps ftrace from trying to modify text that no longer exists. Note, this still does not allow for tracing .init text of modules, as modules require different work for freeing its init code. Link: http://lkml.kernel.org/r/1488502497.7212.24.camel@linux.intel.com Cc: linux-mm@kvack.org Cc: Vlastimil Babka Cc: Mel Gorman Cc: Peter Zijlstra Requested-by: Todd Brandt Signed-off-by: Steven Rostedt (VMware) --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..eee82bfb7cd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -6605,6 +6606,9 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; + /* This may be .init text, inform ftrace to remove it */ + ftrace_free_mem(start, end); + start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { -- cgit v1.2.3 From 0e056eb5530da802c07f080d6bbd43c50e799efd Mon Sep 17 00:00:00 2001 From: "mchehab@s-opensource.com" Date: Thu, 30 Mar 2017 17:11:36 -0300 Subject: kernel-api.rst: fix a series of errors when parsing C files ./lib/string.c:134: WARNING: Inline emphasis start-string without end-string. ./mm/filemap.c:522: WARNING: Inline interpreted text or phrase reference start-string without end-string. ./mm/filemap.c:1283: ERROR: Unexpected indentation. ./mm/filemap.c:3003: WARNING: Inline interpreted text or phrase reference start-string without end-string. ./mm/vmalloc.c:1544: WARNING: Inline emphasis start-string without end-string. ./mm/page_alloc.c:4245: ERROR: Unexpected indentation. ./ipc/util.c:676: ERROR: Unexpected indentation. ./drivers/pci/irq.c:35: WARNING: Block quote ends without a blank line; unexpected unindent. ./security/security.c:109: ERROR: Unexpected indentation. ./security/security.c:110: WARNING: Definition list ends without a blank line; unexpected unindent. ./block/genhd.c:275: WARNING: Inline strong start-string without end-string. ./block/genhd.c:283: WARNING: Inline strong start-string without end-string. ./include/linux/clk.h:134: WARNING: Inline emphasis start-string without end-string. ./include/linux/clk.h:134: WARNING: Inline emphasis start-string without end-string. ./ipc/util.c:477: ERROR: Unknown target name: "s". Signed-off-by: Mauro Carvalho Chehab Acked-by: Bjorn Helgaas Signed-off-by: Jonathan Corbet --- block/genhd.c | 7 ++++--- drivers/pci/irq.c | 2 +- include/linux/clk.h | 4 ++-- ipc/util.c | 12 +++++++----- lib/string.c | 2 +- mm/filemap.c | 18 ++++++++++-------- mm/page_alloc.c | 3 ++- mm/vmalloc.c | 2 +- security/security.c | 12 ++++++++---- 9 files changed, 36 insertions(+), 26 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/block/genhd.c b/block/genhd.c index b26a5ea115d0..a53bfd19a0ec 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -271,16 +271,17 @@ void blkdev_show(struct seq_file *seqf, off_t offset) /** * register_blkdev - register a new block device * - * @major: the requested major device number [1..255]. If @major=0, try to + * @major: the requested major device number [1..255]. If @major = 0, try to * allocate any unused major number. * @name: the name of the new block device as a zero terminated string * * The @name must be unique within the system. * - * The return value depends on the @major input parameter. + * The return value depends on the @major input parameter: + * * - if a major device number was requested in range [1..255] then the * function returns zero on success, or a negative error code - * - if any unused major number was requested with @major=0 parameter + * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..255] or a negative error code otherwise */ diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c index 6684f153ab57..f9f2a0324ecc 100644 --- a/drivers/pci/irq.c +++ b/drivers/pci/irq.c @@ -31,7 +31,7 @@ static void pci_note_irq_problem(struct pci_dev *pdev, const char *reason) * driver). * * Returns: - * a suggestion for fixing it (although the driver is not required to + * a suggestion for fixing it (although the driver is not required to * act on this). */ enum pci_lost_interrupt_reason pci_lost_interrupt(struct pci_dev *pdev) diff --git a/include/linux/clk.h b/include/linux/clk.h index e9d36b3e49de..024cd07870d0 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -132,8 +132,8 @@ int clk_get_phase(struct clk *clk); * @q: clk compared against p * * Returns true if the two struct clk pointers both point to the same hardware - * clock node. Put differently, returns true if struct clk *p and struct clk *q - * share the same struct clk_core object. + * clock node. Put differently, returns true if @p and @q + * share the same &struct clk_core object. * * Returns false otherwise. Note that two NULL clks are treated as matching. */ diff --git a/ipc/util.c b/ipc/util.c index 798cad18dd87..3459a16a9df9 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -474,7 +474,7 @@ void ipc_rcu_free(struct rcu_head *head) * Check user, group, other permissions for access * to ipc resources. return 0 if allowed * - * @flag will most probably be 0 or S_...UGO from + * @flag will most probably be 0 or ``S_...UGO`` from */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { @@ -672,10 +672,12 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. - * It must be called without any lock held and - * - retrieves the ipc with the given id in the given table. - * - performs some audit and permission check, depending on the given cmd - * - returns a pointer to the ipc object or otherwise, the corresponding error. + * It must be called without any lock held and: + * + * - retrieves the ipc with the given id in the given table. + * - performs some audit and permission check, depending on the given cmd + * - returns a pointer to the ipc object or otherwise, the corresponding + * error. * * Call holding the both the rwsem and the rcu read lock. */ diff --git a/lib/string.c b/lib/string.c index ed83562a53ae..b5c9a1168d3a 100644 --- a/lib/string.c +++ b/lib/string.c @@ -131,7 +131,7 @@ EXPORT_SYMBOL(strncpy); * @src: Where to copy the string from * @size: size of destination buffer * - * Compatible with *BSD: the result is always a valid + * Compatible with ``*BSD``: the result is always a valid * NUL-terminated string that fits in the buffer (unless, * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..c5808b7a5fb1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Write out and wait upon file offsets lstart->lend, inclusive. * - * Note that `lend' is inclusive (describes the last byte to be written) so + * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). */ int filemap_write_and_wait_range(struct address_space *mapping, @@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry); * * PCG flags modify how the page is returned. * - * FGP_ACCESSED: the page will be marked accessed - * FGP_LOCK: Page is return locked - * FGP_CREAT: If page is not present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU - * list. The page is returned locked and with an increased - * refcount. Otherwise, %NULL is returned. + * @fgp_flags can be: + * + * - FGP_ACCESSED: the page will be marked accessed + * - FGP_LOCK: Page is return locked + * - FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, NULL is returned. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -3001,7 +3003,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). If the release was successful, return `1'. + * (presumably at page->private). If the release was successful, return '1'. * Otherwise return zero. * * This may also be called if PG_fscache is set on a page, indicating that the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa64d2ffdc5..c1b68edaf106 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4242,7 +4242,8 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * managed_pages - high_pages + * + * nr_free_zone_pages = managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b4024d688f38..c24db06f15c4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1540,7 +1540,7 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * - * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { diff --git a/security/security.c b/security/security.c index d0e07f269b2d..23555c5504f6 100644 --- a/security/security.c +++ b/security/security.c @@ -103,10 +103,14 @@ static int lsm_append(char *new, char **result) * to avoid security registration races. This method may also be used * to check if your LSM is currently loaded during kernel initialization. * - * Return true if: - * -The passed LSM is the one chosen by user at boot time, - * -or the passed LSM is configured as the default and the user did not - * choose an alternate LSM at boot time. + * Returns: + * + * true if: + * + * - The passed LSM is the one chosen by user at boot time, + * - or the passed LSM is configured as the default and the user did not + * choose an alternate LSM at boot time. + * * Otherwise, return false. */ int __init security_module_enable(const char *module) -- cgit v1.2.3 From b80f0f6c9ed3958ff4002b6135f43a1ef312a610 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 12:57:35 -0400 Subject: ftrace: Have init/main.c call ftrace directly to free init memory Relying on free_reserved_area() to call ftrace to free init memory proved to not be sufficient. The issue is that on x86, when debug_pagealloc is enabled, the init memory is not freed, but simply set as not present. Since ftrace was uninformed of this, starting function tracing still tries to update pages that are not present according to the page tables, causing ftrace to bug, as well as killing the kernel itself. Instead of relying on free_reserved_area(), have init/main.c call ftrace directly just before it frees the init memory. Then it needs to use __init_begin and __init_end to know where the init memory location is. Looking at all archs (and testing what I can), it appears that this should work for each of them. Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- init/main.c | 1 + kernel/trace/ftrace.c | 7 ++++--- mm/page_alloc.c | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0276a2c487e6..ef7123219f14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,9 +147,9 @@ struct ftrace_ops_hash { struct mutex regex_lock; }; -void ftrace_free_mem(void *start, void *end); +void ftrace_free_init_mem(void); #else -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* @@ -266,7 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/init/main.c b/init/main.c index c0137b916aa1..0e8849f74561 100644 --- a/init/main.c +++ b/init/main.c @@ -962,6 +962,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + ftrace_free_init_mem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aff7a2c08387..8efd9fe7aec0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,7 @@ #include +#include #include #include "trace_output.h" @@ -5279,10 +5280,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void __init ftrace_free_init_mem(void) { - unsigned long start = (unsigned long)start_ptr; - unsigned long end = (unsigned long)end_ptr; + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee82bfb7cd8..178bf9c2a2cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6606,9 +6606,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; - /* This may be .init text, inform ftrace to remove it */ - ftrace_free_mem(start, end); - start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { -- cgit v1.2.3 From d34b0733b452ca3ef1dee36436dab08b8aa6a85c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 20 Apr 2017 14:37:43 -0700 Subject: Revert "mm, page_alloc: only use per-cpu allocator for irq-safe requests" This reverts commit 374ad05ab64. While the patch worked great for userspace allocations, the fact that softirq loses the per-cpu allocator caused problems. It needs to be redone taking into account that a separate list is needed for hard/soft IRQs or alternatively find a cheap way of detecting reentry due to an interrupt. Both are possible but sufficiently tricky that it shouldn't be rushed. Jesper had one method for allowing softirqs but reported that the cost was high enough that it performed similarly to a plain revert. His figures for netperf TCP_STREAM were as follows Baseline v4.10.0 : 60316 Mbit/s Current 4.11.0-rc6: 47491 Mbit/s Jesper's patch : 60662 Mbit/s This patch : 60106 Mbit/s As this is a regression, I wish to revert to noirq allocator for now and go back to the drawing board. Link: http://lkml.kernel.org/r/20170415145350.ixy7vtrzdzve57mh@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Tariq Toukan Acked-by: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3d603cef2c0..07efbc3a8656 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,10 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned, flags; + unsigned long nr_scanned; bool isolated_pageblocks; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) @@ -1142,7 +1142,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, @@ -1150,9 +1150,8 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned, flags; - spin_lock_irqsave(&zone->lock, flags); - __count_vm_events(PGFREE, 1 << order); + unsigned long nr_scanned; + spin_lock(&zone->lock); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); @@ -1162,7 +1161,7 @@ static void free_one_page(struct zone *zone, migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1240,6 +1239,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) static void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); @@ -1247,7 +1247,10 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); + local_irq_save(flags); + __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype); + local_irq_restore(flags); } static void __init __free_pages_boot_core(struct page *page, unsigned int order) @@ -2219,9 +2222,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; - unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2257,7 +2259,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return alloced; } @@ -2485,20 +2487,17 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; + unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; - if (in_interrupt()) { - __free_pages_ok(page, 0); - return; - } - if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - preempt_disable(); + local_irq_save(flags); + __count_vm_event(PGFREE); /* * We only track unmovable, reclaimable and movable on pcp lists. @@ -2515,7 +2514,6 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } - __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2529,7 +2527,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - preempt_enable(); + local_irq_restore(flags); } /* @@ -2654,8 +2652,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, { struct page *page; - VM_BUG_ON(in_interrupt()); - do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, @@ -2686,8 +2682,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; + unsigned long flags; - preempt_disable(); + local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); @@ -2695,7 +2692,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); } - preempt_enable(); + local_irq_restore(flags); return page; } @@ -2711,7 +2708,7 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0) && !in_interrupt()) { + if (likely(order == 0)) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype); goto out; -- cgit v1.2.3 From c73322d098e4b6f5f0f0fa1330bf57e218775539 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:51:51 -0700 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes Patch series "mm: kswapd spinning on unreclaimable nodes - fixes and cleanups". Jia reported a scenario in which the kswapd of a node indefinitely spins at 100% CPU usage. We have seen similar cases at Facebook. The kernel's current method of judging its ability to reclaim a node (or whether to back off and sleep) is based on the amount of scanned pages in proportion to the amount of reclaimable pages. In Jia's and our scenarios, there are no reclaimable pages in the node, however, and the condition for backing off is never met. Kswapd busyloops in an attempt to restore the watermarks while having nothing to work with. This series reworks the definition of an unreclaimable node based not on scanning but on whether kswapd is able to actually reclaim pages in MAX_RECLAIM_RETRIES (16) consecutive runs. This is the same criteria the page allocator uses for giving up on direct reclaim and invoking the OOM killer. If it cannot free any pages, kswapd will go to sleep and leave further attempts to direct reclaim invocations, which will either make progress and re-enable kswapd, or invoke the OOM killer. Patch #1 fixes the immediate problem Jia reported, the remainder are smaller fixlets, cleanups, and overall phasing out of the old method. Patch #6 is the odd one out. It's a nice cleanup to get_scan_count(), and directly related to #5, but in itself not relevant to the series. If the whole series is too ambitious for 4.11, I would consider the first three patches fixes, the rest cleanups. This patch (of 9): Jia He reports a problem with kswapd spinning at 100% CPU when requesting more hugepages than memory available in the system: $ echo 4000 >/proc/sys/vm/nr_hugepages top - 13:42:59 up 3:37, 1 user, load average: 1.09, 1.03, 1.01 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.0 us, 12.5 sy, 0.0 ni, 85.5 id, 2.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 31371520 total, 30915136 used, 456384 free, 320 buffers KiB Swap: 6284224 total, 115712 used, 6168512 free. 48192 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 76 root 20 0 0 0 0 R 100.0 0.000 217:17.29 kswapd3 At that time, there are no reclaimable pages left in the node, but as kswapd fails to restore the high watermarks it refuses to go to sleep. Kswapd needs to back away from nodes that fail to balance. Up until commit 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") kswapd had such a mechanism. It considered zones whose theoretically reclaimable pages it had reclaimed six times over as unreclaimable and backed away from them. This guard was erroneously removed as the patch changed the definition of a balanced node. However, simply restoring this code wouldn't help in the case reported here: there *are* no reclaimable pages that could be scanned until the threshold is met. Kswapd would stay awake anyway. Introduce a new and much simpler way of backing off. If kswapd runs through MAX_RECLAIM_RETRIES (16) cycles without reclaiming a single page, make it back off from the node. This is the same number of shots direct reclaim takes before declaring OOM. Kswapd will go to sleep on that node until a direct reclaimer manages to reclaim some pages, thus proving the node reclaimable again. [hannes@cmpxchg.org: check kswapd failure against the cumulative nr_reclaimed count] Link: http://lkml.kernel.org/r/20170306162410.GB2090@cmpxchg.org [shakeelb@google.com: fix condition for throttle_direct_reclaim] Link: http://lkml.kernel.org/r/20170314183228.20152-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20170228214007.5621-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Shakeel Butt Reported-by: Jia He Tested-by: Jia He Acked-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/internal.h | 6 ++++++ mm/page_alloc.c | 9 ++------- mm/vmscan.c | 47 ++++++++++++++++++++++++++++++++--------------- mm/vmstat.c | 2 +- 5 files changed, 43 insertions(+), 23 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..d2c50ab6ae40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -630,6 +630,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/mm/internal.h b/mm/internal.h index 266efaeaa370..e5a0e0ec2177 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,6 +80,12 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd01501efab9..42c0543e46c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3521,12 +3521,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. @@ -4534,7 +4528,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..667644e53b5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2817,7 +2822,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone) || @@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3114,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,6 +3136,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3214,9 +3226,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3295,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,10 +3318,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3509,6 +3525,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3779,9 +3799,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a4f5c5a31e8..baee70dafba8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3 From c822f6223d03c2c5b026a21da09c6b6d523258cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:52:10 -0700 Subject: mm: delete NR_PAGES_SCANNED and pgdat_reclaimable() NR_PAGES_SCANNED counts number of pages scanned since the last page free event in the allocator. This was used primarily to measure the reclaimability of zones and nodes, and determine when reclaim should give up on them. In that role, it has been replaced in the preceding patches by a different mechanism. Being implemented as an efficient vmstat counter, it was automatically exported to userspace as well. It's however unlikely that anyone outside the kernel is using this counter in any meaningful way. Remove the counter and the unused pgdat_reclaimable(). Link: http://lkml.kernel.org/r/20170228214007.5621-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/internal.h | 1 - mm/page_alloc.c | 11 ----------- mm/vmscan.c | 9 --------- mm/vmstat.c | 22 +++------------------- 5 files changed, 3 insertions(+), 41 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d2c50ab6ae40..04e0969966f6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,7 +149,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, diff --git a/mm/internal.h b/mm/internal.h index e5a0e0ec2177..a36719572eb9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -91,7 +91,6 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42c0543e46c3..6994f28f769c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,14 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; bool isolated_pageblocks; spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1150,12 +1146,7 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -4504,7 +4495,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4527,7 +4517,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9117ae8d49ee..02f2eb51b33e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -230,12 +230,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -1750,7 +1744,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1953,8 +1946,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index baee70dafba8..c8d15051616b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -1378,7 +1377,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,7 +1384,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1586,22 +1583,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. - */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) -- cgit v1.2.3 From 491d79ae778f24dbf65f1f2178d4744d940d4093 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:52:16 -0700 Subject: mm: remove unnecessary back-off function when retrying page reclaim The backoff mechanism is not needed. If we have MAX_RECLAIM_RETRIES loops without progress, we'll OOM anyway; backing off might cut one or two iterations off that in the rare OOM case. If we have intermittent success reclaiming a few pages, the backoff function gets reset also, and so is of little help in these scenarios. We might want a backoff function for when there IS progress, but not enough to be satisfactory. But this isn't that. Remove it. Link: http://lkml.kernel.org/r/20170228214007.5621-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6994f28f769c..f82beddbd96f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3515,11 +3515,10 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3564,13 +3563,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); -- cgit v1.2.3 From a6ffdc07847e74cc244c02ab6d0351a4a5d77281 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 3 May 2017 14:52:52 -0700 Subject: mm: use is_migrate_highatomic() to simplify the code Introduce two helpers, is_migrate_highatomic() and is_migrate_highatomic_page(). Simplify the code, no functional changes. [akpm@linux-foundation.org: use static inlines rather than macros, per mhocko] Link: http://lkml.kernel.org/r/58B94F15.6060606@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/internal.h | 10 ++++++++++ mm/page_alloc.c | 14 ++++++-------- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 04e0969966f6..446cf68c1c09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, diff --git a/mm/internal.h b/mm/internal.h index a36719572eb9..04d08ef91224 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -510,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f82beddbd96f..34ac32428de8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2036,8 +2036,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); @@ -2094,8 +2094,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. */ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2152,8 +2151,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) + if (can_steal && !is_migrate_highatomic_page(page)) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2493,7 +2491,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2603,7 +2601,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } -- cgit v1.2.3 From 7dea19f9ee636cb244109a4dba426bbb3e5304b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 3 May 2017 14:53:15 -0700 Subject: mm: introduce memalloc_nofs_{save,restore} API GFP_NOFS context is used for the following 5 reasons currently: - to prevent from deadlocks when the lock held by the allocation context would be needed during the memory reclaim - to prevent from stack overflows during the reclaim because the allocation is performed from a deep context already - to prevent lockups when the allocation context depends on other reclaimers to make a forward progress indirectly - just in case because this would be safe from the fs POV - silence lockdep false positives Unfortunately overuse of this allocation context brings some problems to the MM. Memory reclaim is much weaker (especially during heavy FS metadata workloads), OOM killer cannot be invoked because the MM layer doesn't have enough information about how much memory is freeable by the FS layer. In many cases it is far from clear why the weaker context is even used and so it might be used unnecessarily. We would like to get rid of those as much as possible. One way to do that is to use the flag in scopes rather than isolated cases. Such a scope is declared when really necessary, tracked per task and all the allocation requests from within the context will simply inherit the GFP_NOFS semantic. Not only this is easier to understand and maintain because there are much less problematic contexts than specific allocation requests, this also helps code paths where FS layer interacts with other layers (e.g. crypto, security modules, MM etc...) and there is no easy way to convey the allocation context between the layers. Introduce memalloc_nofs_{save,restore} API to control the scope of GFP_NOFS allocation context. This is basically copying memalloc_noio_{save,restore} API we have for other restricted allocation context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is just an alias for PF_FSTRANS which has been xfs specific until recently. There are no more PF_FSTRANS users anymore so let's just drop it. PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags is renamed to current_gfp_context because it now cares about both PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve their semantic. kmem_flags_convert() doesn't need to evaluate the flag anymore. This patch shouldn't introduce any functional changes. Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS) usage as much as possible and only use a properly documented memalloc_nofs_{save,restore} checkpoints where they are appropriate. [akpm@linux-foundation.org: fix comment typo, reflow comment] Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/kmem.h | 2 +- include/linux/gfp.h | 8 ++++++++ include/linux/sched.h | 8 +++----- include/linux/sched/mm.h | 26 +++++++++++++++++++++++--- kernel/locking/lockdep.c | 6 +++--- mm/page_alloc.c | 10 ++++++---- mm/vmscan.c | 6 +++--- 7 files changed, 47 insertions(+), 19 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index a6c8da40c70d..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 978232a3b4ae..2bfcfd33e476 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -210,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ac11465ac5b..993e7e25a3a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1224,9 +1224,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ @@ -1237,8 +1237,6 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ - /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..9daabe138c99 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. + * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f84294c9a018..fd440b5a3c75 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2877,7 +2877,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; - gfp_mask = memalloc_noio_flags(gfp_mask); + gfp_mask = current_gfp_context(gfp_mask); /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -2888,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -3954,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } void lockdep_clear_current_reclaim_state(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34ac32428de8..7a3751e53f91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3951,10 +3951,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -7408,7 +7410,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/vmscan.c b/mm/vmscan.c index ec4555369e17..3ad66580b8b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2915,7 +2915,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2995,7 +2995,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3702,7 +3702,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), -- cgit v1.2.3 From 822519634142cb66614514ac8ee9221868f764bb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:53:48 -0700 Subject: mm: page_alloc: __GFP_NOWARN shouldn't suppress stall warnings __GFP_NOWARN, which is usually added to avoid warnings from callsites that expect to fail and have fallbacks, currently also suppresses allocation stall warnings. These trigger when an allocation is stuck inside the allocator for 10 seconds or longer. But there is no class of allocations that can get legitimately stuck in the allocator for this long. This always indicates a problem. Always emit stall warnings. Restrict __GFP_NOWARN to alloc failures. Link: http://lkml.kernel.org/r/20170125181150.GA16398@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a3751e53f91..465391811c2e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3751,7 +3751,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; -- cgit v1.2.3 From bd33ef3681359343863f2290aded182b0441edee Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 3 May 2017 14:54:42 -0700 Subject: mm: enable page poisoning early at boot On SPARSEMEM systems page poisoning is enabled after buddy is up, because of the dependency on page extension init. This causes the pages released by free_all_bootmem not to be poisoned. This either delays or misses the identification of some issues because the pages have to undergo another cycle of alloc-free-alloc for any corruption to be detected. Enable page poisoning early by getting rid of the PAGE_EXT_DEBUG_POISON flag. Since all the free pages will now be poisoned, the flag need not be verified before checking the poison during an alloc. [vinmenon@codeaurora.org: fix Kconfig] Link: http://lkml.kernel.org/r/1490878002-14423-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1490358246-11001-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Laura Abbott Tested-by: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/Kconfig.debug | 1 - mm/page_alloc.c | 13 +++------ mm/page_ext.c | 13 ++------- mm/page_poison.c | 77 +++++++++--------------------------------------------- 5 files changed, 17 insertions(+), 88 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 695da2a19b4c..5d22e69f51ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2487,7 +2487,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 465391811c2e..f1f225608413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1689,10 +1689,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1746,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include #include -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf) early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) -{ - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) { /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. */ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was posioned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } -- cgit v1.2.3 From 0f7896f12b6a3dae3ffccbebe8c954d954350f3d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 3 May 2017 14:55:34 -0700 Subject: mm, page_alloc: remove debug_guardpage_minorder() test in warn_alloc() Commit c0a32fc5a2e4 ("mm: more intensive memory corruption debugging") changed to check debug_guardpage_minorder() > 0 when reporting allocation failures. The reasoning was When we use guard page to debug memory corruption, it shrinks available pages to 1/2, 1/4, 1/8 and so on, depending on parameter value. In such case memory allocation failures can be common and printing errors can flood dmesg. If somebody debug corruption, allocation failures are not the things he/she is interested about. but this is misguided. Allocation requests with __GFP_NOWARN flag by definition do not cause flooding of allocation failure messages. Allocation requests with __GFP_NORETRY flag likely also have __GFP_NOWARN flag. Costly allocation requests likely also have __GFP_NOWARN flag. Allocation requests without __GFP_DIRECT_RECLAIM flag likely also have __GFP_NOWARN flag or __GFP_HIGH flag. Non-costly allocation requests with __GFP_DIRECT_RECLAIM flag basically retry forever due to the "too small to fail" memory-allocation rule. Therefore, as a whole, shrinking available pages by debug_guardpage_minorder= kernel boot parameter might cause flooding of OOM killer messages but unlikely causes flooding of allocation failure messages. Let's remove debug_guardpage_minorder() > 0 check which would likely be pointless. Link: http://lkml.kernel.org/r/1491910035-4231-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Stanislaw Gruszka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: "Rafael J . Wysocki" Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1f225608413..1e2af704938d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3092,8 +3092,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; pr_warn("%s: ", current->comm); -- cgit v1.2.3 From 3bc48f96cf11ce8699e419d5e47ae0d456403274 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:54:37 -0700 Subject: mm, page_alloc: split smallest stolen page in fallback The __rmqueue_fallback() function is called when there's no free page of requested migratetype, and we need to steal from a different one. There are various heuristics to make this event infrequent and reduce permanent fragmentation. The main one is to try stealing from a pageblock that has the most free pages, and possibly steal them all at once and convert the whole pageblock. Precise searching for such pageblock would be expensive, so instead the heuristics walks the free lists from MAX_ORDER down to requested order and assumes that the block with highest-order free page is likely to also have the most free pages in total. Chances are that together with the highest-order page, we steal also pages of lower orders from the same block. But then we still split the highest order page. This is wasteful and can contribute to fragmentation instead of avoiding it. This patch thus changes __rmqueue_fallback() to just steal the page(s) and put them on the freelist of the requested migratetype, and only report whether it was successful. Then we pick (and eventually split) the smallest page with __rmqueue_smallest(). This all happens under zone lock, so nobody can steal it from us in the process. This should reduce fragmentation due to fallbacks. At worst we are only stealing a single highest-order page and waste some cycles by moving it between lists and then removing it, but fallback is not exactly hot path so that should not be a concern. As a side benefit the patch removes some duplicate code by reusing __rmqueue_smallest(). [vbabka@suse.cz: fix endless loop in the modified __rmqueue()] Link: http://lkml.kernel.org/r/59d71b35-d556-4fc9-ee2e-1574259282fd@suse.cz Link: http://lkml.kernel.org/r/20170307131545.28577-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 62 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c25de46c58f..2f1118b4dda4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1948,23 +1948,44 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * use it's pages as requested migratetype in the future. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); + struct free_area *area; int pages; + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic_page(page)) + goto single_page; + /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; + pages = move_freepages_block(zone, page, start_type); + /* moving whole block can fail due to zone boundary conditions */ + if (!pages) + goto single_page; /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2123,8 +2144,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2145,32 +2171,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && !is_migrate_highatomic_page(page)) - steal_suitable_fallback(zone, page, start_migratetype); - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2182,13 +2193,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); -- cgit v1.2.3 From 02aa0cdd72483c6dd436ed24d1000f86e0038d28 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:54:40 -0700 Subject: mm, page_alloc: count movable pages when stealing from pageblock When stealing pages from pageblock of a different migratetype, we count how many free pages were stolen, and change the pageblock's migratetype if more than half of the pageblock was free. This might be too conservative, as there might be other pages that are not free, but were allocated with the same migratetype as our allocation requested. While we cannot determine the migratetype of allocated pages precisely (at least without the page_owner functionality enabled), we can count pages that compaction would try to isolate for migration - those are either on LRU or __PageMovable(). The rest can be assumed to be MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE, which we cannot easily distinguish. This counting can be done as part of free page stealing with little additional overhead. The page stealing code is changed so that it considers free pages plus pages of the "good" migratetype for the decision whether to change pageblock's migratetype. The result should be more accurate migratetype of pageblocks wrt the actual pages in the pageblocks, when stealing from semi-occupied pageblocks. This should help the efficiency of page grouping by mobility. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 47%. The number of movable allocations falling back to other pageblocks are increased by 55%, but these events don't cause permanent fragmentation, so the tradeoff should be positive. Later patches also offset the movable fallback increase to some extent. [akpm@linux-foundation.org: merge fix] Link: http://lkml.kernel.org/r/20170307131545.28577-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 5 +-- mm/page_alloc.c | 74 +++++++++++++++++++++++++++++++++--------- mm/page_isolation.c | 5 +-- 3 files changed, 63 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f1118b4dda4..d90792addeb9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1832,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1851,6 +1851,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1861,6 +1864,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1876,7 +1888,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1893,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1943,22 +1956,26 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; - int pages; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. */ - if (is_migrate_highatomic_page(page)) + if (is_migrate_highatomic(old_block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ @@ -1971,13 +1988,39 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - pages = move_freepages_block(zone, page, start_type); + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + /* moving whole block can fail due to zone boundary conditions */ - if (!pages) + if (!free_pages) goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); @@ -2055,7 +2098,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2132,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7927bbb54a4e..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); -- cgit v1.2.3 From 282722b0d258ec23fc79d80165418fee83f01736 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:54:49 -0700 Subject: mm, compaction: restrict async compaction to pageblocks of same migratetype The migrate scanner in async compaction is currently limited to MIGRATE_MOVABLE pageblocks. This is a heuristic intended to reduce latency, based on the assumption that non-MOVABLE pageblocks are unlikely to contain movable pages. However, with the exception of THP's, most high-order allocations are not movable. Should the async compaction succeed, this increases the chance that the non-MOVABLE allocations will fallback to a MOVABLE pageblock, making the long-term fragmentation worse. This patch attempts to help the situation by changing async direct compaction so that the migrate scanner only scans the pageblocks of the requested migratetype. If it's a non-MOVABLE type and there are such pageblocks that do contain movable pages, chances are that the allocation can succeed within one of such pageblocks, removing the need for a fallback. If that fails, the subsequent sync attempt will ignore this restriction. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 30%. The number of movable allocations falling back is reduced by 12%. Link: http://lkml.kernel.org/r/20170307131545.28577-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++++++++-- mm/page_alloc.c | 20 +++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/compaction.c b/mm/compaction.c index 365b3c8ae943..206847d35978 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -986,10 +986,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->mode != MIGRATE_ASYNC) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; - return is_migrate_movable(get_pageblock_migratetype(page)); + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; } /* Returns true if the page is within a block suitable for migration to */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d90792addeb9..e7486afa7fa7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3665,6 +3665,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3732,12 +3733,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3749,7 +3755,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3830,7 +3836,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, -- cgit v1.2.3 From 62be1511b1db8066220b18b7d4da2e6b9fdc69fb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:59:46 -0700 Subject: mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC Patch series "more robust PF_MEMALLOC handling" This series aims to unify the setting and clearing of PF_MEMALLOC, which prevents recursive reclaim. There are some places that clear the flag unconditionally from current->flags, which may result in clearing a pre-existing flag. This already resulted in a bug report that Patch 1 fixes (without the new helpers, to make backporting easier). Patch 2 introduces the new helpers, modelled after existing memalloc_noio_* and memalloc_nofs_* helpers, and converts mm core to use them. Patches 3 and 4 convert non-mm code. This patch (of 4): __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent deadlock during page migration by lock_page() (see the comment in __unmap_and_move()). Then it unconditionally clears the flag, which can clear a pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath"), because direct compation was called only after direct reclaim, which was skipped when PF_MEMALLOC flag was set. Even now it's only a theoretical issue, as the new callsite of __alloc_pages_direct_compact() is reached only for costly orders and when gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in gfp_flags or in_interrupt() is true. There is no such known context, but let's play it safe and make __alloc_pages_direct_compact() robust for cases where PF_MEMALLOC is already set. Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath") Link: http://lkml.kernel.org/r/20170405074700.29871-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Andrey Ryabinin Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: Johannes Weiner Cc: Boris Brezillon Cc: Chris Leech Cc: "David S. Miller" Cc: Eric Dumazet Cc: Josef Bacik Cc: Lee Duncan Cc: Michal Hocko Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e7486afa7fa7..1daf509722c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3283,6 +3283,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned int noreclaim_flag = current->flags & PF_MEMALLOC; if (!order) return NULL; @@ -3290,7 +3291,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags &= ~PF_MEMALLOC; + current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag; if (*compact_result <= COMPACT_INACTIVE) return NULL; -- cgit v1.2.3 From 499118e966f1d2150bd66647c8932343c4e9a0b8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:59:50 -0700 Subject: mm: introduce memalloc_noreclaim_{save,restore} The previous patch ("mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC") has shown that simply setting and clearing PF_MEMALLOC in current->flags can result in wrongly clearing a pre-existing PF_MEMALLOC flag and potentially lead to recursive reclaim. Let's introduce helpers that support proper nesting by saving the previous stat of the flag, similar to the existing memalloc_noio_* and memalloc_nofs_* helpers. Convert existing setting/clearing of PF_MEMALLOC within mm to the new helpers. There are no known issues with the converted code, but the change makes it more robust. Link: http://lkml.kernel.org/r/20170405074700.29871-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: Johannes Weiner Cc: Andrey Ryabinin Cc: Boris Brezillon Cc: Chris Leech Cc: "David S. Miller" Cc: Eric Dumazet Cc: Josef Bacik Cc: Lee Duncan Cc: Michal Hocko Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 12 ++++++++++++ mm/page_alloc.c | 11 ++++++----- mm/vmscan.c | 17 +++++++++++------ 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9daabe138c99..2b24a6974847 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -191,4 +191,16 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1daf509722c7..f9e450c6b6e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3283,15 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - unsigned int noreclaim_flag = current->flags & PF_MEMALLOC; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3438,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3453,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4e7ed65842af..2f45c0520f43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3036,6 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | @@ -3062,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3589,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3599,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } @@ -3764,6 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in struct task_struct *p = current; struct reclaim_state reclaim_state; int classzone_idx = gfp_zone(gfp_mask); + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), @@ -3781,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + noreclaim_flag = memalloc_noreclaim_save(); + p->flags |= PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3797,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + current->flags &= ~PF_SWAPWRITE; + memalloc_noreclaim_restore(noreclaim_flag); lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } -- cgit v1.2.3