From 0eef5ca171aac20bf116132b4326745ee39ed1ef Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 5 Apr 2017 09:19:23 +1000 Subject: mm: fix page_vma_mapped_walk() for ksm pages Doug Smythies reports oops with KSM in this backtrace, I've been seeing the same: page_vma_mapped_walk+0xe6/0x5b0 page_referenced_one+0x91/0x1a0 rmap_walk_ksm+0x100/0x190 rmap_walk+0x4f/0x60 page_referenced+0x149/0x170 shrink_active_list+0x1c2/0x430 shrink_node_memcg+0x67a/0x7a0 shrink_node+0xe1/0x320 kswapd+0x34b/0x720 Just as 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() for ksm pages") observed, you cannot use page->index calculations on ksm pages. page_vma_mapped_walk() is relying on __vma_address(), where a ksm page can lead it off the end of the page table, and into whatever nonsense is in the next page, ending as an oops inside check_pte()'s pte_page(). KSM tells page_vma_mapped_walk() exactly where to look for the page, it does not need any page->index calculation: and that's so also for all the normal and file and anon pages - just not for THPs and their subpages. Get out early in most cases: instead of a PageKsm test, move down the earlier not-THP-page test, as suggested by Kirill. I'm also slightly worried that this loop can stray into other vmas, so added a vm_end test to prevent surprises; though I have not imagined anything worse than a very contrived case, in which a page mlocked in the next vma might be reclaimed because it is not mlocked in this vma. Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1704031104400.1118@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Doug Smythies Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c4c9def8ffea..de9c40d7304a 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -111,12 +111,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - /* Only for THP, seek to next pte entry makes sense */ - if (pvmw->pte) { - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) - return not_found(pvmw); + if (pvmw->pte) goto next_pte; - } if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ @@ -165,9 +161,14 @@ restart: while (1) { if (check_pte(pvmw)) return true; -next_pte: do { +next_pte: + /* Seek to next pte only makes sense for THP */ + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= + if (pvmw->address >= pvmw->vma->vm_end || + pvmw->address >= __vma_address(pvmw->page, pvmw->vma) + hpage_nr_pages(pvmw->page) * PAGE_SIZE) return not_found(pvmw); -- cgit v1.2.3 From f3de4d62683128ee0b46e8536173b8c3e7ff0b62 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 5 Apr 2017 09:19:25 +1000 Subject: userfaultfd: report actual registered features in fdinfo fdinfo for userfault file descriptor reports UFFD_API_FEATURES. Up until recently, the UFFD_API_FEATURES was defined as 0, therefore corresponding field in fdinfo always contained zero. Now, with introduction of several additional features, UFFD_API_FEATURES is not longer 0 and it seems better to report actual features requested for the userfaultfd object described by the fdinfo. First, the applications that were using userfault will still see zero at the features field in fdinfo. 
Next, reporting actual features rather than available features gives a clear indication of what userfault features are used by an application. Link: http://lkml.kernel.org/r/1491140181-22121-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d227b0fcf49..f7555fc25877 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1756,7 +1756,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) * protocols: aa:... bb:... */ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", - pending, total, UFFD_API, UFFD_API_FEATURES, + pending, total, UFFD_API, ctx->features, UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); } #endif -- cgit v1.2.3 From dfc6d75e9e61869b2a1de4acbb601c584c8dd1d3 Mon Sep 17 00:00:00 2001 From: Alexander Polakov Date: Wed, 5 Apr 2017 09:19:26 +1000 Subject: mm/page_alloc.c: fix print order in show_free_areas() Fixes: 11fb998986a72a ("mm: move most file-based accounting to the node") Link: http://lkml.kernel.org/r/1490377730.30219.2.camel@beget.ru Signed-off-by: Alexander Polyakov Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..d6a665057d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4519,13 +4519,13 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif - K(node_page_state(pgdat, NR_SHMEM)), K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), -- cgit v1.2.3 From 939897e2d736e25ee8917096de1586226fed9a7b Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 5 Apr 2017 09:19:27 +1000 Subject: vmlinux.lds: add missing VMLINUX_SYMBOL macros When __{start,end}_ro_after_init is referenced from C code, we run into the following build errors on blackfin: kernel/extable.c:169: undefined reference to `__start_ro_after_init' kernel/extable.c:169: undefined reference to `__end_ro_after_init' The build error is due to the fact that blackfin is one of the few arches that prepends an underscore '_' to all symbols defined in C. Fix this by wrapping __{start,end}_ro_after_init in vmlinux.lds.h with VMLINUX_SYMBOL(), which adds the necessary prefix for arches that have HAVE_UNDERSCORE_SYMBOL_PREFIX.
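For reference, C code reaches these section markers through extern array declarations; a minimal sketch of the idiom (the helper function below is hypothetical, only the two symbol names come from the patch):

    #include <linux/types.h>

    extern const char __start_ro_after_init[], __end_ro_after_init[];

    /* true if addr falls inside the .data..ro_after_init section */
    static bool addr_in_ro_after_init(unsigned long addr)
    {
            return addr >= (unsigned long)__start_ro_after_init &&
                   addr <  (unsigned long)__end_ro_after_init;
    }

On an arch with HAVE_UNDERSCORE_SYMBOL_PREFIX the compiler emits these references with a leading underscore, so the linker script must define the prefixed name as well, which is exactly what VMLINUX_SYMBOL() expands to.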
Link: http://lkml.kernel.org/r/1491259387-15869-1-git-send-email-jeyu@redhat.com Signed-off-by: Jessica Yu Acked-by: Kees Cook Cc: Arnd Bergmann Cc: Eddie Kovsky Signed-off-by: Andrew Morton --- include/asm-generic/vmlinux.lds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7cdfe167074f..143db9c523e2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -261,9 +261,9 @@ */ #ifndef RO_AFTER_INIT_DATA #define RO_AFTER_INIT_DATA \ - __start_ro_after_init = .; \ + VMLINUX_SYMBOL(__start_ro_after_init) = .; \ *(.data..ro_after_init) \ - __end_ro_after_init = .; + VMLINUX_SYMBOL(__end_ro_after_init) = .; #endif /* -- cgit v1.2.3 From ae03d3524968e4c0538d1652f3da906c07cfafe5 Mon Sep 17 00:00:00 2001 From: "bsegall@google.com" Date: Wed, 5 Apr 2017 09:19:29 +1000 Subject: ptrace: fix PTRACE_LISTEN race corrupting task->state In PT_SEIZED + LISTEN mode STOP/CONT signals cause a wakeup against __TASK_TRACED. If this races with the ptrace_unfreeze_traced at the end of a PTRACE_LISTEN, this can wake the task /after/ the check against __TASK_TRACED, but before the reset of state to TASK_TRACED. This causes it to instead clobber TASK_WAKING, allowing a subsequent wakeup against TRACED while the task is still on the rq wake_list, corrupting it. Link: http://lkml.kernel.org/r/xm26y3vfhmkp.fsf_-_@bsegall-linux.mtv.corp.google.com Signed-off-by: Ben Segall --- kernel/ptrace.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0af928712174..ede305b040ce 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up + * remotely. Recheck state under the lock to close this race. + */ spin_lock_irq(&task->sighand->siglock); - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } -- cgit v1.2.3 From 15743a3c805e67a54aaf72faacb7db5c6613a06b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:30 +1000 Subject: ptrace-fix-ptrace_listen-race-corrupting-task-state-checkpatch-fixes ERROR: code indent should use tabs where possible #39: FILE: kernel/ptrace.c:188: + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up$ ERROR: code indent should use tabs where possible #40: FILE: kernel/ptrace.c:189: + * remotely. 
Recheck state under the lock to close this race.$ ERROR: code indent should use tabs where possible #41: FILE: kernel/ptrace.c:190: + */$ WARNING: please, no spaces at the start of a line #47: FILE: kernel/ptrace.c:192: + if (task->state == __TASK_TRACED) {$ WARNING: suspect code indent for conditional statements (7, 15) #47: FILE: kernel/ptrace.c:192: + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) ERROR: code indent should use tabs where possible #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task))$ WARNING: please, no spaces at the start of a line #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task))$ WARNING: suspect code indent for conditional statements (15, 23) #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); ERROR: code indent should use tabs where possible #49: FILE: kernel/ptrace.c:194: + wake_up_state(task, __TASK_TRACED);$ WARNING: please, no spaces at the start of a line #49: FILE: kernel/ptrace.c:194: + wake_up_state(task, __TASK_TRACED);$ ERROR: code indent should use tabs where possible #50: FILE: kernel/ptrace.c:195: + else$ WARNING: please, no spaces at the start of a line #50: FILE: kernel/ptrace.c:195: + else$ ERROR: code indent should use tabs where possible #51: FILE: kernel/ptrace.c:196: + task->state = TASK_TRACED;$ WARNING: please, no spaces at the start of a line #51: FILE: kernel/ptrace.c:196: + task->state = TASK_TRACED;$ WARNING: please, no spaces at the start of a line #52: FILE: kernel/ptrace.c:197: + }$ total: 7 errors, 8 warnings, 21 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/ptrace-fix-ptrace_listen-race-corrupting-task-state.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Ben Segall Signed-off-by: Andrew Morton --- kernel/ptrace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ede305b040ce..266ddcc1d8bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,17 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); - /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up - * remotely. Recheck state under the lock to close this race. - */ + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. + * Recheck state under the lock to close this race. + */ spin_lock_irq(&task->sighand->siglock); - if (task->state == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; - } + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } -- cgit v1.2.3 From 6e19569d23154ab79a56db662752cb597481599d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:32 +1000 Subject: arm: arch/arm/include/asm/page.h needs personality.h VM_DATA_DEFAULT_FLAGS uses READ_IMPLIES_EXEC, so page.h should include personality.h to provide this. 
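For context, the macro in question tests the current task's personality bits; its definition is roughly of this shape (paraphrased sketch, not the verbatim arm definition):

    /* READ_IMPLIES_EXEC comes from <linux/personality.h> */
    #define VM_DATA_DEFAULT_FLAGS \
            (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
             VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

Without the include, the macro only compiles when some other header happens to pull personality.h in first.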
This fixes no known bugs and can be safely ignored ;) Cc: Russell King Cc: Will Deacon Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- arch/arm/include/asm/page.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include -- cgit v1.2.3 From 834fac54d12562b594d676623c0e11627ef558cc Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:35 +1000 Subject: dax: add tracepoints to dax_iomap_pte_fault() Patch series "second round of tracepoints for DAX". This second round of DAX tracepoint patches adds tracing to the PTE fault path (dax_iomap_pte_fault(), dax_pfn_mkwrite(), dax_load_hole(), dax_insert_mapping()) and to the writeback path (dax_writeback_mapping_range(), dax_writeback_one()). The purpose of this tracing is to give us a high level view of what DAX is doing, whether faults are being serviced by PMDs or PTEs, and by real storage or by zero pages covering holes. I do have some patches nearly ready which also add tracing to grab_mapping_entry() and dax_insert_mapping_entry(). These are more targeted at logging how we are interacting with the radix tree, how we use empty entries for locking, whether we "downgrade" huge zero pages to 4k PTE sized allocations, etc. In the end it seemed to me that this might be too detailed to have as constantly present tracepoints, but if anyone sees value in having tracepoints like this in the DAX code permanently (Jan?), please let me know and I'll add those last two patches. All these tracepoints were done to be consistent with the style of the XFS tracepoints and with the existing DAX PMD tracepoints. This patch (of 6): Add tracepoints to dax_iomap_pte_fault(), following the same logging conventions as the rest of DAX. Here is an example fault that initially tries to be serviced by the PMD fault handler but which falls back to PTEs because the VMA isn't large enough to hold a PMD: small-1086 [005] .... 71.140014: xfs_filemap_huge_fault: dev 259:0 ino 0x1003 small-1086 [005] .... 71.140027: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 vm_start 0x10200000 vm_end 0x10500000 pgoff 0x220 max_pgoff 0x1400 small-1086 [005] .... 71.140028: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 vm_start 0x10200000 vm_end 0x10500000 pgoff 0x220 max_pgoff 0x1400 FALLBACK small-1086 [005] .... 71.140035: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 small-1086 [005] .... 
71.140396: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 MAJOR|NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 15 +++++++++++---- include/trace/events/fs_dax.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index de622d4282a6..13acc2655428 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1133,13 +1133,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1150,8 +1153,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); + if (error) { + vmf_ret = dax_fault_return(error); + goto out; + } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ goto finish_iomap; @@ -1235,6 +1240,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } +out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c566ddc87f73..cbcd7d64a18d 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -150,6 +150,47 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); +DECLARE_EVENT_CLASS(dax_pte_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), + TP_ARGS(inode, vmf, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? 
"shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PTE_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pte_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \ + TP_ARGS(inode, vmf, result)) + +DEFINE_PTE_FAULT_EVENT(dax_pte_fault); +DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From d2cdb9662231f4edffdab1f4740cf5ef4f0f7a1a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:37 +1000 Subject: dax: add tracepoints to dax_pfn_mkwrite() Add tracepoints to dax_pfn_mkwrite(), following the same logging conventions as the rest of DAX. Here is an example PTE fault followed by a pfn_mkwrite: small_aligned-1094 [002] .... 374.084998: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 small_aligned-1094 [002] .... 374.085145: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 MAJOR|NOPAGE small_aligned-1094 [002] .... 374.085165: dax_pfn_mkwrite: dev 259:0 ino 0x1003 shared WRITE|MKWRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-3-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 3 +++ include/trace/events/fs_dax.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 13acc2655428..ed9af976d8a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -932,6 +932,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -941,6 +942,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -953,6 +955,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index cbcd7d64a18d..b5a520961f8d 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); #endif /* _TRACE_FS_DAX_H */ -- cgit v1.2.3 From 64a157cb721ab13e64e85460fb56b3351347a515 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:38 +1000 Subject: dax: add tracepoints to dax_load_hole() Add tracepoints to dax_load_hole(), following the same logging conventions as the rest of DAX. Here is the logging generated by a PTE read from a hole: read-1075 [002] .... 
62.362108: dax_pte_fault: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 read-1075 [002] .... 62.362140: dax_load_hole: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 NOPAGE read-1075 [002] .... 62.362141: dax_pte_fault_done: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 16 +++++++++++----- include/trace/events/fs_dax.h | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index ed9af976d8a6..c46a5a666a3e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -546,21 +546,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -568,8 +572,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index b5a520961f8d..2f15dfea7fb1 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -192,6 +192,7 @@ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); +DEFINE_PTE_FAULT_EVENT(dax_load_hole); #endif /* _TRACE_FS_DAX_H */ -- cgit v1.2.3 From 107f537c68f9cd9d7ab585963c4a5df71196e960 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:40 +1000 Subject: dax: add tracepoints to dax_writeback_mapping_range() Add tracepoints to dax_writeback_mapping_range(), following the same logging conventions as the rest of DAX. Here is an example writeback call: msync-1085 [006] .... 200.902565: dax_writeback_range: dev 259:0 ino 0x1003 pgoff 0x200-0x2ff msync-1085 [006] .... 
200.902579: dax_writeback_range_done: dev 259:0 ino 0x1003 pgoff 0x200-0x2ff Link: http://lkml.kernel.org/r/20170221195116.13278-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 8 ++++++-- include/trace/events/fs_dax.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c46a5a666a3e..9b1c7ed4dddc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -879,6 +879,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -899,10 +901,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + trace_dax_writeback_range_done(inode, start_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 2f15dfea7fb1..9afe8c8f0bef 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -194,6 +194,38 @@ DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DECLARE_EVENT_CLASS(dax_writeback_range_class, + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), + TP_ARGS(inode, start_index, end_index), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, start_index) + __field(pgoff_t, end_index) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_index = start_index; + __entry->end_index = end_index; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->start_index, + __entry->end_index + ) +) + +#define DEFINE_WRITEBACK_RANGE_EVENT(name) \ +DEFINE_EVENT(dax_writeback_range_class, name, \ + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\ + TP_ARGS(inode, start_index, end_index)) + +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 05afbdd973135e9d18bfc70110e583ddc90d0a45 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:41 +1000 Subject: dax: fix regression in dax_writeback_mapping_range() commit 354ae7432ee8 ("dax: add tracepoints to dax_writeback_mapping_range()") in the -next tree, which appears in next-20170310, inadvertently changed dax_writeback_mapping_range() so that it could end up returning a positive value: the number of bytes flushed, as returned by dax_writeback_one(). This was incorrect. This function either needs to return a negative error value, or zero on success. This change was causing xfstest failures, as reported by Xiong: https://lkml.org/lkml/2017/3/13/1220 With this fix applied to next-20170310, all the test failures reported by Xiong (generic/075 generic/112 generic/127 generic/231 generic/263) are resolved. 
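To illustrate why a positive return is harmful, a hypothetical caller sketch (the signature is assumed from this era of the DAX code); callers treat any non-zero return as failure:

    int ret;

    ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
    if (ret)
            return ret;     /* expected to be 0 on success or a negative errno */

A positive "entries flushed" count leaking out of dax_writeback_mapping_range() would therefore be reported as an error to the writeback machinery.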
Link: http://lkml.kernel.org/r/20170314215358.31451-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Xiong Zhou Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 9b1c7ed4dddc..23a55b485be2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -906,7 +906,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, } out: trace_dax_writeback_range_done(inode, start_index, end_index); - return ret; + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -- cgit v1.2.3 From ff91661855f2a95a77f3cea5e5e56daabd3759ea Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:42 +1000 Subject: dax: add tracepoint to dax_writeback_one() Add a tracepoint to dax_writeback_one(), following the same logging conventions as the rest of DAX. Here is an example range writeback which ends up flushing one PMD and one PTE: test-1265 [003] .... 496.615250: dax_writeback_range: dev 259:0 ino 0x1003 pgoff 0x0-0x7ffffffffffff test-1265 [003] .... 496.616263: dax_writeback_one: dev 259:0 ino 0x1003 pgoff 0x0 pglen 0x200 test-1265 [003] .... 496.616270: dax_writeback_one: dev 259:0 ino 0x1003 pgoff 0x305 pglen 0x1 test-1265 [003] .... 496.616272: dax_writeback_range_done: dev 259:0 ino 0x1003 pgoff 0x0-0x7ffffffffffff Link: http://lkml.kernel.org/r/20170221195116.13278-6-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 1 + include/trace/events/fs_dax.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 23a55b485be2..5a38d40457e8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -844,6 +844,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT); unmap: dax_unmap_atomic(bdev, &dax); put_locked_mapping_entry(mapping, index, entry); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 9afe8c8f0bef..292a4719edd0 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -226,6 +226,30 @@ DEFINE_EVENT(dax_writeback_range_class, name, \ DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); +TRACE_EVENT(dax_writeback_one, + TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), + TP_ARGS(inode, pgoff, pglen), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, pgoff) + __field(pgoff_t, pglen) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = pgoff; + __entry->pglen = pglen; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->pglen + ) +) + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c345b97bdbb460e10e259f0be94cea951d84b268 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:44 +1000 Subject: dax: add tracepoint to dax_insert_mapping() Add a tracepoint to dax_insert_mapping(), following the same logging conventions as 
the rest of DAX. This tracepoint, along with the one in dax_load_hole(), lets us know how a DAX PTE fault was serviced. Here is an example DAX fault that inserts a PTE mapping: small-1126 [007] .... 145.451604: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 small-1126 [007] .... 145.452317: dax_insert_mapping: dev 259:0 ino 0x1003 shared write address 0x10420000 radix_entry 0x100006 small-1126 [007] .... 145.452399: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 MAJOR|NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-7-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 1 + include/trace/events/fs_dax.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 5a38d40457e8..cd29ad7b6ae9 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -932,6 +932,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, dax.pfn); } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 292a4719edd0..08bb3ed18dcc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -194,6 +194,36 @@ DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +TRACE_EVENT(dax_insert_mapping, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), + TP_ARGS(inode, vmf, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + (unsigned long)__entry->radix_entry + ) +) + DECLARE_EVENT_CLASS(dax_writeback_range_class, TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), TP_ARGS(inode, start_index, end_index), -- cgit v1.2.3 From bcd025be9f1051a729c1a457399e7bd81064e3bb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 09:19:48 +1000 Subject: kbuild: consolidate header generation from ASM offset information Largely redundant code is used in different places to generate C headers from offset information extracted from assembly language output. Consolidate the code in a Makefile include and use this instead. 
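As a reminder of what this machinery consumes, a minimal asm-offsets.c-style producer looks roughly like this (the constant name and struct member are illustrative; DEFINE() comes from <linux/kbuild.h>):

    #include <linux/kbuild.h>
    #include <linux/stddef.h>
    #include <linux/sched.h>

    int main(void)
    {
            DEFINE(TASK_STACK_OFFSET, offsetof(struct task_struct, stack));
            return 0;
    }

Compiling this with -S leaves "->TASK_STACK_OFFSET <value> offsetof(struct task_struct, stack)" markers in the assembly output, which the consolidated sed expression rewrites into "#define TASK_STACK_OFFSET <value> /* offsetof(struct task_struct, stack) */" lines in the generated header.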
Link: http://lkml.kernel.org/r/20170403193739.84905-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: Tony Luck Cc: Fenghua Yu Cc: Michal Marek Cc: Jan Kiszka Cc: Kieran Bingham Cc: Grant Grundler Cc: Michael Davidson Cc: Greg Hackmann Cc: Masahiro Yamada Signed-off-by: Andrew Morton --- Kbuild | 21 ++------------------- arch/ia64/kernel/Makefile | 19 +++---------------- scripts/Makefile.asm-offsets | 22 ++++++++++++++++++++++ scripts/mod/Makefile | 21 ++------------------- 4 files changed, 29 insertions(+), 54 deletions(-) create mode 100644 scripts/Makefile.asm-offsets diff --git a/Kbuild b/Kbuild index 3d0ae152af7c..e3789c9611fd 100644 --- a/Kbuild +++ b/Kbuild @@ -7,29 +7,12 @@ # 4) Check for missing system calls # 5) Generate constants.py (may need bounds.h) -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include scripts/Makefile.asm-offsets # Use filechk to avoid rebuilds when a header changes, but the resulting file # does not define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) + $(call gen_header_from_asm_offsets,$2) endef ##### diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..186ba553ff26 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,25 +50,12 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate +include $(srctree)/scripts/Makefile.asm-offsets + # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) 
based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef quiet_cmd_nr_irqs = GEN $@ define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__ASM_NR_IRQS_H__) < $< > $@ endef # We use internal kbuild rules to avoid the "is up to date" message from make diff --git a/scripts/Makefile.asm-offsets b/scripts/Makefile.asm-offsets new file mode 100644 index 000000000000..4ba80ba29b82 --- /dev/null +++ b/scripts/Makefile.asm-offsets @@ -0,0 +1,22 @@ +# Default sed regexp - multiline due to syntax constraints +define sed-asm-offsets-to-c + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +define gen_header_from_asm_offsets + (set -e; \ + echo "#ifndef $1"; \ + echo "#define $1"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-asm-offsets-to-c); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 19d9bcadc0cc..5858bebfaf32 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -7,28 +7,11 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include $(srctree)/scripts/Makefile.asm-offsets quiet_cmd_offsets = GEN $@ define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__DEVICETABLE_OFFSETS_H__) < $< > $@ endef $(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s -- cgit v1.2.3 From 197ec848ee088be8030c0c96a961114b7f676127 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Apr 2017 09:19:52 +1000 Subject: fs/ocfs2/cluster: use setup_timer Use setup_timer() instead of init_timer() to simplify the code. 
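The change follows the usual pattern; with placeholder names (the timer API of this era, where the callback takes an unsigned long cookie), the conversion looks like:

    struct timer_list timer;

    /* open-coded initialization */
    init_timer(&timer);
    timer.function = my_timer_fn;           /* void my_timer_fn(unsigned long data) */
    timer.data = (unsigned long)my_obj;

    /* equivalent single call */
    setup_timer(&timer, my_timer_fn, (unsigned long)my_obj);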
Link: http://lkml.kernel.org/r/5e75bf07beb91e092d5aa36c36769949a480456a.1489060564.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b4..13014f83042c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); -- cgit v1.2.3 From 033881af1a0b7d79f7082a28558d9c1d0c20d765 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 5 Apr 2017 09:19:54 +1000 Subject: ocfs2: o2hb: revert hb threshold to keep compatible Configfs is the interface for ocfs2-tools to set configure to kernel and $configfs_dir/cluster/$clustername/heartbeat/dead_threshold is the one used to configure heartbeat dead threshold. Kernel has a default value of it but user can set O2CB_HEARTBEAT_THRESHOLD in /etc/sysconfig/o2cb to override it. Commit 45b997737a80 ("ocfs2/cluster: use per-attribute show and store methods") changed heartbeat dead threshold name while ocfs2-tools did not, so ocfs2-tools won't set this configurable and the default value is always used. So revert it. Fixes: 45b997737a80 ("ocfs2/cluster: use per-attribute show and store methods") Link: http://lkml.kernel.org/r/1490665245-15374-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f6e871760f8d..0da0332725aa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; -- cgit v1.2.3 From 2702777758dd0840f2246acedc8ecaa271710e36 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 5 Apr 2017 09:19:55 +1000 Subject: ocfs2: old mle put and release after the function dlm_add_migration_mle called If the old mle is found after the dlm_add_migration_mle called, it should be put once. 
If the return value is not - EEXIST and its type is BLOCK, it should be put again to release it to avoid memory leak, for it had been unhashed from the map. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4A3D4B7FE@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..4438671c4ac3 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,45 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if(oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* if the type of the mle is BLOCK, should put it once for release. + * otherwise memory leak may be caused because oldmle had been unhashed + * from the hash map, it will not be found anymore. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2667,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3201,23 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. 
+ */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: -- cgit v1.2.3 From 40f5224a0525caa6c4e4962eea92e24bbe9e5349 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:57 +1000 Subject: ocfs2-old-mle-put-and-release-after-the-function-dlm_add_migration_mle-called-fix fix coding style, comments Cc: Guozhonghua Cc: Joel Becker Cc: Joseph Qi Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4438671c4ac3..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2613,7 +2613,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); if (ret == -EEXIST) { - if(oldmle) + if (oldmle) __dlm_put_mle(oldmle); spin_unlock(&dlm->master_lock); @@ -2622,10 +2622,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, goto fail; } - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. Because it has been unhasded from the map * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { @@ -2633,9 +2634,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, __dlm_mle_detach_hb_events(dlm, oldmle); __dlm_put_mle(oldmle); - /* if the type of the mle is BLOCK, should put it once for release. - * otherwise memory leak may be caused because oldmle had been unhashed - * from the hash map, it will not be found anymore. + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. */ if (oldmle->type == DLM_MLE_BLOCK) __dlm_put_mle(oldmle); @@ -3201,10 +3204,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map - * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { -- cgit v1.2.3 From 4819d61ef0476abdfcfd770a4d846cc4d0624e52 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 5 Apr 2017 09:19:58 +1000 Subject: ocfs2/dlm: optimize freeing of dead node locks Three loops can be optimized into one and its sub loops, so less code can do the same work. 
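The consolidated loop walks the granted, converting and blocked queues by index; dlm_list_idx_to_ptr() maps that index back to the per-resource list head, roughly like this (paraphrased sketch, not the verbatim dlmcommon.h definition):

    static inline struct list_head *
    dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
    {
            struct list_head *ret = NULL;

            if (idx == DLM_GRANTED_LIST)
                    ret = &res->granted;
            else if (idx == DLM_CONVERTING_LIST)
                    ret = &res->converting;
            else if (idx == DLM_BLOCKED_LIST)
                    ret = &res->blocked;
            BUG_ON(!ret);
            return ret;
    }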
Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4C4AF898E@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..4c4b18e612c5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,18 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * do manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 4816877e9bc5e27f13dfa19829f9e36a9da6298a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:00 +1000 Subject: ocfs2-dlm-optimization-of-code-while-free-dead-node-locks-checkpatch-fixes WARNING: please, no spaces at the start of a line #26: FILE: fs/ocfs2/dlm/dlmrecovery.c:2271: + struct list_head *queue = NULL;$ WARNING: please, no spaces at the start of a line #27: FILE: fs/ocfs2/dlm/dlmrecovery.c:2272: + int i;$ WARNING: please, no spaces at the start of a line #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {$ WARNING: suspect code indent for conditional statements (7, 15) #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); ERROR: code indent should use tabs where possible #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ WARNING: please, no spaces at the start of a line #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ ERROR: code indent should use tabs where possible #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: please, no spaces at the start of a line #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: suspect code indent for conditional statements (15, 23) #62: FILE: 
fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { ERROR: code indent should use tabs where possible #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: please, no spaces at the start of a line #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: suspect code indent for conditional statements (23, 31) #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); ERROR: code indent should use tabs where possible #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ WARNING: please, no spaces at the start of a line #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ ERROR: code indent should use tabs where possible #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #66: FILE: fs/ocfs2/dlm/dlmrecovery.c:2291: + /* Can't schedule DLM_UNLOCK_FREE_LOCK$ ERROR: code indent should use tabs where possible #67: FILE: fs/ocfs2/dlm/dlmrecovery.c:2292: + * do manually$ ERROR: code indent should use tabs where possible #68: FILE: fs/ocfs2/dlm/dlmrecovery.c:2293: + */$ ERROR: code indent should use tabs where possible #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ WARNING: please, no spaces at the start of a line #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ ERROR: code indent should use tabs where possible #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ WARNING: please, no spaces at the start of a line #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ total: 11 errors, 14 warnings, 51 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/ocfs2-dlm-optimization-of-code-while-free-dead-node-locks.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Cc: Guozhonghua Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4c4b18e612c5..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,8 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; - struct list_head *queue = NULL; - int i; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2282,18 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { - queue = dlm_list_idx_to_ptr(res, i); - list_for_each_entry_safe(lock, next, queue, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - * do manually - */ - dlm_lock_put(lock); - freed++; - } + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 44ecd79dcf536a3335dea8f4e20a0460aabb3651 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 5 Apr 2017 09:20:02 +1000 Subject: block: restore /proc/partitions to not display non-partitionable removable devices We found with newer kernels we started seeing the cdrom device showing up in /proc/partitions, but it was not there before. Looking into this I found that commit d27769ec ("block: add GENHD_FL_NO_PART_SCAN") introduces this change in behavior. It's not clear to me from the commit's changelog if this change was intentional or not. This comment still remains: /* Don't show non-partitionable removeable devices or empty devices */ so I've decided to send a patch to restore the behavior of not printing unpartitionable removable devices. Signed-off-by: Josh Hunt Cc: Tejun Heo Cc: Kay Sievers Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a9c516a8b37d..08c1cc707702 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -889,7 +889,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) -- cgit v1.2.3 From 21447a80830ea99460fbde74cc026ea80fe13fb0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:04 +1000 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes Patch series "mm: kswapd spinning on unreclaimable nodes - fixes and cleanups". Jia reported a scenario in which the kswapd of a node indefinitely spins at 100% CPU usage. We have seen similar cases at Facebook. 
The kernel's current method of judging its ability to reclaim a node (or whether to back off and sleep) is based on the amount of scanned pages in proportion to the amount of reclaimable pages. In Jia's and our scenarios, there are no reclaimable pages in the node, however, and the condition for backing off is never met. Kswapd busyloops in an attempt to restore the watermarks while having nothing to work with. This series reworks the definition of an unreclaimable node based not on scanning but on whether kswapd is able to actually reclaim pages in MAX_RECLAIM_RETRIES (16) consecutive runs. This is the same criteria the page allocator uses for giving up on direct reclaim and invoking the OOM killer. If it cannot free any pages, kswapd will go to sleep and leave further attempts to direct reclaim invocations, which will either make progress and re-enable kswapd, or invoke the OOM killer. Patch #1 fixes the immediate problem Jia reported, the remainder are smaller fixlets, cleanups, and overall phasing out of the old method. Patch #6 is the odd one out. It's a nice cleanup to get_scan_count(), and directly related to #5, but in itself not relevant to the series. If the whole series is too ambitious for 4.11, I would consider the first three patches fixes, the rest cleanups. This patch (of 9): Jia He reports a problem with kswapd spinning at 100% CPU when requesting more hugepages than memory available in the system: $ echo 4000 >/proc/sys/vm/nr_hugepages top - 13:42:59 up 3:37, 1 user, load average: 1.09, 1.03, 1.01 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.0 us, 12.5 sy, 0.0 ni, 85.5 id, 2.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 31371520 total, 30915136 used, 456384 free, 320 buffers KiB Swap: 6284224 total, 115712 used, 6168512 free. 48192 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 76 root 20 0 0 0 0 R 100.0 0.000 217:17.29 kswapd3 At that time, there are no reclaimable pages left in the node, but as kswapd fails to restore the high watermarks it refuses to go to sleep. Kswapd needs to back away from nodes that fail to balance. Up until 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") kswapd had such a mechanism. It considered zones whose theoretically reclaimable pages it had reclaimed six times over as unreclaimable and backed away from them. This guard was erroneously removed as the patch changed the definition of a balanced node. However, simply restoring this code wouldn't help in the case reported here: there *are* no reclaimable pages that could be scanned until the threshold is met. Kswapd would stay awake anyway. Introduce a new and much simpler way of backing off. If kswapd runs through MAX_RECLAIM_RETRIES (16) cycles without reclaiming a single page, make it back off from the node. This is the same number of shots direct reclaim takes before declaring OOM. Kswapd will go to sleep on that node until a direct reclaimer manages to reclaim some pages, thus proving the node reclaimable again. 
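In outline, the new back-off behaves like the following toy userspace model (a sketch only, not kernel code: the struct and helpers are simplified stand-ins, though the constant and field name mirror the patch):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16	/* same limit direct reclaim uses before OOM */

struct node {
	int kswapd_failures;
};

/* One reclaim run against a node: no progress counts as a failure. */
static void reclaim_run(struct node *n, unsigned long nr_reclaimed)
{
	if (nr_reclaimed)
		n->kswapd_failures = 0;	/* any progress resets the counter */
	else
		n->kswapd_failures++;
}

/* Wakeup/sleep decision: a hopeless node is left to direct reclaim. */
static bool kswapd_may_run(const struct node *n)
{
	return n->kswapd_failures < MAX_RECLAIM_RETRIES;
}

int main(void)
{
	struct node n = { 0 };

	while (kswapd_may_run(&n))
		reclaim_run(&n, 0);	/* nothing reclaimable on this node */
	printf("kswapd backs off after %d failed runs\n", n.kswapd_failures);

	reclaim_run(&n, 32);		/* direct reclaim made progress */
	printf("kswapd may run again: %s\n", kswapd_may_run(&n) ? "yes" : "no");
	return 0;
}

In the real patch the pieces live in balance_pgdat() (counting failed runs), shrink_node() (resetting on progress), and prepare_kswapd_sleep()/wakeup_kswapd() (backing off), as the diff below shows.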
v2: move MAX_RECLAIM_RETRIES to mm/internal.h (Michal) Link: http://lkml.kernel.org/r/20170228214007.5621-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Jia He Tested-by: Jia He Acked-by: Michal Hocko Acked-by: Hillf Danton Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ mm/internal.h | 6 ++++++ mm/page_alloc.c | 9 ++------- mm/vmscan.c | 27 ++++++++++++++++++++------- mm/vmstat.c | 2 +- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..d2c50ab6ae40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -630,6 +630,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/mm/internal.h b/mm/internal.h index ccfc2a2969f4..aae93e3fd984 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,6 +80,12 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d6a665057d61..8a40aedddfe5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3517,12 +3517,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. @@ -4529,7 +4523,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..71292ef80ed4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. 
@@ -3128,6 +3133,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3310,6 +3319,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3509,6 +3521,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3779,9 +3795,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 89f95396ec46..16d0411fc2cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3 From b956edd67315a4381af1fba5af7f5f33f5bbd1fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:05 +1000 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes fix Check kswapd failure against the cumulative nr_reclaimed count, not against the count from the lowest priority iteration. Link: http://lkml.kernel.org/r/20170306162410.GB2090@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Minchan Kim Acked-by: Minchan Kim Cc: Michal Hocko Cc: Jia He Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 71292ef80ed4..ea90340ef933 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3223,9 +3223,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3315,7 +3315,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); -- cgit v1.2.3 From fa6f62b7f13e9671f3a1299db003001bb0cb2e06 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 5 Apr 2017 09:20:06 +1000 Subject: mm: fix condition for throttle_direct_reclaim Since "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes" kswapd has been modified to give up after MAX_RECLAIM_RETRIES number of unsucessful iterations. 
Before going to sleep, kswapd thread will unconditionally wakeup all threads sleeping on pfmemalloc_wait. However the awoken threads will recheck the watermarks and wake the kswapd thread and sleep again on pfmemalloc_wait. There is a chance that the system might end up in livelock between unsuccessful kswapd and direct reclaimers because all direct reclaimer might end up in throttle_direct_reclaim and there is nobody to make a forward progress. So, add kswapd_failures check on the throttle_direct_reclaim condition. Link: http://lkml.kernel.org/r/20170314183228.20152-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Michal Hocko Suggested-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmscan.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ea90340ef933..667644e53b5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2822,7 +2822,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2830,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone) || @@ -2910,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2932,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3119,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3304,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ -- cgit v1.2.3 From c779bcc2064754fbb791181c5b3c5c723e2287ce Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:07 +1000 Subject: mm: fix check for reclaimable pages in PF_MEMALLOC reclaim throttling PF_MEMALLOC direct reclaimers get throttled on a node when the sum of all free pages in each zone fall below half the min watermark. During the summation, we want to exclude zones that don't have reclaimables. 
Checking the same pgdat over and over again doesn't make sense. Fixes: 599d0c954f91 ("mm, vmscan: move LRU lists to node") Link: http://lkml.kernel.org/r/20170228214007.5621-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 667644e53b5c..52832bedb2ed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2835,8 +2835,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); -- cgit v1.2.3 From 281e5c092e1e7336f0d44ef9baa25f9fb2593ee6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:08 +1000 Subject: mm: remove seemingly spurious reclaimability check from laptop_mode gating 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") allowed laptop_mode=1 to start writing not just when the priority drops to DEF_PRIORITY - 2 but also when the node is unreclaimable. That appears to be a spurious change in this patch as I doubt the series was tested with laptop_mode, and neither is that particular change mentioned in the changelog. Remove it, it's still recent. Link: http://lkml.kernel.org/r/20170228214007.5621-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Michal Hocko Cc: Jia He Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 52832bedb2ed..014d0d181be0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3285,7 +3285,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ -- cgit v1.2.3 From f9a37e599cf573f5da678439ecadbe2de3455a37 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:08 +1000 Subject: mm: remove unnecessary reclaimability check from NUMA balancing target NUMA balancing already checks the watermarks of the target node to decide whether it's a suitable balancing target. Whether the node is reclaimable or not is irrelevant when we don't intend to reclaim. 
Link: http://lkml.kernel.org/r/20170228214007.5621-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/migrate.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ed97c2c14fa8..011df454812b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, { int z; - if (!pgdat_reclaimable(pgdat)) - return false; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; -- cgit v1.2.3 From d7c5d077f9c4ae68137b15f8c4dfcb8946e93928 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:09 +1000 Subject: mm: don't avoid high-priority reclaim on unreclaimable nodes 246e87a93934 ("memcg: fix get_scan_count() for small targets") sought to avoid high reclaim priorities for kswapd by forcing it to scan a minimum amount of pages when lru_pages >> priority yielded nothing. b95a2f2d486d ("mm: vmscan: convert global reclaim to per-memcg LRU lists"), due to switching global reclaim to a round-robin scheme over all cgroups, had to restrict this forceful behavior to unreclaimable zones in order to prevent massive overreclaim with many cgroups. The latter patch effectively neutered the behavior completely for all but extreme memory pressure. But in those situations we might as well drop the reclaimers to lower priority levels. Remove the check. Link: http://lkml.kernel.org/r/20170228214007.5621-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 014d0d181be0..2fd50ca88016 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2130,22 +2130,13 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, int pass; /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When + * If the zone or memcg is small, nr[l] can be 0. When * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. + * latencies, so it's better to scan a minimum amount. When a + * cgroup has already been deleted, scrape out the remaining + * cache forcefully to get rid of the lingering state. */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) + if (!global_reclaim(sc) || !mem_cgroup_online(memcg)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ -- cgit v1.2.3 From eba74071a6806d36f26a6c4d1771448f97fc9604 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:10 +1000 Subject: mm: don't avoid high-priority reclaim on memcg limit reclaim 246e87a93934 ("memcg: fix get_scan_count() for small targets") sought to avoid high reclaim priorities for memcg by forcing it to scan a minimum amount of pages when lru_pages >> priority yielded nothing. This was done at a time when reclaim decisions like dirty throttling were tied to the priority level. 
Nowadays, the only meaningful thing still tied to priority dropping below DEF_PRIORITY - 2 is gating whether laptop_mode=1 is generally allowed to write. But that is from an era where direct reclaim was still allowed to call ->writepage, and kswapd nowadays avoids writes until it's scanned every clean page in the system. Potential changes to how quick sc->may_writepage could trigger are of little concern. Remove the force_scan stuff, as well as the ugly multi-pass target calculation that it necessitated. Link: http://lkml.kernel.org/r/20170228214007.5621-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 94 ++++++++++++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 57 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2fd50ca88016..9117ae8d49ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2123,21 +2123,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount. When a - * cgroup has already been deleted, scrape out the remaining - * cache forcefully to get rid of the lingering state. - */ - if (!global_reclaim(sc) || !mem_cgroup_online(memcg)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2268,55 +2255,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. - */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. 
*/ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } -- cgit v1.2.3 From f66b0818d4b2b9ccd141582acb63002578a90427 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:10 +1000 Subject: mm: delete NR_PAGES_SCANNED and pgdat_reclaimable() NR_PAGES_SCANNED counts number of pages scanned since the last page free event in the allocator. This was used primarily to measure the reclaimability of zones and nodes, and determine when reclaim should give up on them. In that role, it has been replaced in the preceding patches by a different mechanism. Being implemented as an efficient vmstat counter, it was automatically exported to userspace as well. It's however unlikely that anyone outside the kernel is using this counter in any meaningful way. Remove the counter and the unused pgdat_reclaimable(). Link: http://lkml.kernel.org/r/20170228214007.5621-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - mm/internal.h | 1 - mm/page_alloc.c | 15 +++------------ mm/vmscan.c | 9 --------- mm/vmstat.c | 22 +++------------------- 5 files changed, 6 insertions(+), 42 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d2c50ab6ae40..04e0969966f6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,7 +149,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, diff --git a/mm/internal.h b/mm/internal.h index aae93e3fd984..c583ce1b32b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -91,7 +91,6 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a40aedddfe5..5f2fd0bd5ead 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,15 +1090,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned, flags; + unsigned long flags; bool isolated_pageblocks; spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - while (count) { struct page *page; struct list_head *list; @@ -1150,13 +1146,10 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned, flags; + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, 
pfn); @@ -4499,7 +4492,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4522,7 +4514,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9117ae8d49ee..02f2eb51b33e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -230,12 +230,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -1750,7 +1744,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1953,8 +1946,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 16d0411fc2cc..9911ec58fce2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -1378,7 +1377,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,7 +1384,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1587,22 +1584,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. - */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) -- cgit v1.2.3 From c72c741ba1c05e83497dcb417c48c0ac0ffdaf9b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:11 +1000 Subject: Revert "mm, vmscan: account for skipped pages as a partial scan" This reverts commit d7f05528eedb047efe2288cff777676b028747b6. 
Now that reclaimability of a node is no longer based on the ratio between pages scanned and theoretically reclaimable pages, we can remove accounting tricks for pages skipped due to zone constraints. Link: http://lkml.kernel.org/r/20170228214007.5621-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 02f2eb51b33e..77832f0dbe0d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1472,12 +1472,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; + unsigned long skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + !list_empty(src); scan++) { struct page *page; page = lru_to_page(src); @@ -1491,12 +1491,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } - /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. - */ - scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1525,6 +1519,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1532,17 +1527,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); -- cgit v1.2.3 From 08b755e44f1f17bc3aba76c8b54e020d4291787e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:11 +1000 Subject: mm: remove unnecessary back-off function when retrying page reclaim The backoff mechanism is not needed. If we have MAX_RECLAIM_RETRIES loops without progress, we'll OOM anyway; backing off might cut one or two iterations off that in the rare OOM case. If we have intermittent success reclaiming a few pages, the backoff function gets reset also, and so is of little help in these scenarios. We might want a backoff function for when there IS progress, but not enough to be satisfactory. But this isn't that. Remove it. 
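For a concrete feel of what is being deleted, here is a standalone arithmetic sketch (not kernel code; the page count is invented) of how the removed DIV_ROUND_UP scaling discounted the reclaimable estimate as no_progress_loops grew:

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long reclaimable = 10000;	/* hypothetical LRU page count */
	unsigned long loops;

	for (loops = 0; loops <= MAX_RECLAIM_RETRIES; loops += 4) {
		unsigned long available = reclaimable -
			DIV_ROUND_UP(loops * reclaimable, MAX_RECLAIM_RETRIES);
		printf("no_progress_loops=%2lu -> considers %5lu of %lu pages\n",
		       loops, available, reclaimable);
	}
	return 0;
}

The estimate only collapses to zero right at MAX_RECLAIM_RETRIES, so removing the scaling delays the give-up point by at most an iteration or two, as noted above.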
Link: http://lkml.kernel.org/r/20170228214007.5621-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f2fd0bd5ead..1f222dd91a5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3513,11 +3513,10 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3562,13 +3561,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); -- cgit v1.2.3 From 932489293c74c6ee03c0ee82fdf81a949c957e33 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Apr 2017 09:20:12 +1000 Subject: mm/page-writeback.c: use setup_deferrable_timer Use setup_deferrable_timer() instead of init_timer_deferrable() to simplify the code. Link: http://lkml.kernel.org/r/e8e3d4280a34facbc007346f31df833cec28801e.1488070291.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton --- mm/page-writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7fb9e7..33df0583edb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - init_timer_deferrable(&dom->period_timer); - dom->period_timer.function = writeout_period; - dom->period_timer.data = (unsigned long)dom; + setup_deferrable_timer(&dom->period_timer, writeout_period, + (unsigned long)dom); dom->dirty_limit_tstamp = jiffies; -- cgit v1.2.3 From e7b090350311749b3e5ad07954f8649be9ebedff Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:13 +1000 Subject: mm: delete unnecessary TTU_* flags Patch series "mm: fix some MADV_FREE issues", v5. We are trying to use MADV_FREE in jemalloc. Several issues are found. Without solving the issues, jemalloc can't use the MADV_FREE feature. - Doesn't support system without swap enabled. Because if swap is off, we can't or can't efficiently age anonymous pages. And since MADV_FREE pages are mixed with other anonymous pages, we can't reclaim MADV_FREE pages. In current implementation, MADV_FREE will fallback to MADV_DONTNEED without swap enabled. 
But in our environment, a lot of machines don't enable swap, which prevents our setup from using MADV_FREE.

- Increases memory pressure. Page reclaim is biased towards reclaiming file pages over anonymous pages. That doesn't make sense for MADV_FREE pages, because those pages can be freed easily and refilled with very slight penalty. Even if page reclaim didn't prefer file pages there would still be an issue, because MADV_FREE pages and other anonymous pages are mixed together: to reclaim a MADV_FREE page we probably have to scan a lot of other anonymous pages, which is inefficient. In our tests we usually see OOM with MADV_FREE enabled and nothing without it.

- Accounting. There are two accounting problems. We don't have global accounting, so if the system misbehaves we don't know whether the problem comes from the MADV_FREE side. The other problem is RSS accounting: MADV_FREE pages are accounted as normal anon pages and reclaimed lazily, so the application's RSS grows. This confuses our workloads. We have a monitoring daemon running, and if it finds an application's RSS has become abnormal it kills the application, even though the kernel could reclaim the memory easily.

To address the first two issues we can either put MADV_FREE pages into a separate LRU list (Minchan's previous patches and the V1 patches), or put them into the LRU_INACTIVE_FILE list (suggested by Johannes). The patchset uses the second idea. The reason is that the LRU_INACTIVE_FILE list is tiny nowadays and should be full of used-once file pages, so we can still reclaim MADV_FREE pages efficiently there without interfering with other anon and active file pages. Putting the pages on the inactive file list also lets page reclaim prioritize MADV_FREE pages alongside used-once file pages. MADV_FREE pages are put on that LRU list with their SwapBacked flag cleared, so PageAnon(page) && !PageSwapBacked(page) indicates a MADV_FREE page. Such pages are freed directly, without pageout, if they are clean; otherwise normal swap reclaims them.

For the third issue, the previous post added global accounting and a separate RSS count for MADV_FREE pages. The problem is that we never get accurate accounting for MADV_FREE pages: they are mapped into userspace and can be dirtied without the kernel noticing. To get accurate accounting we could write-protect the pages, but that adds page-fault overhead which people don't want to pay. The jemalloc developers have concerns about the inaccurate accounting, so this post drops the accounting patches temporarily. The information exported to /proc/pid/smaps for MADV_FREE pages is kept, which is the only place we can get accurate accounting right now.

This patch (of 6):

Johannes pointed out that TTU_LZFREE is unnecessary. It's true because we always have the flag set if we want to do an unmap; for the cases where we don't unmap, the TTU_LZFREE part of the code should never run. TTU_UNMAP is unnecessary as well: if no other flag (for example, TTU_MIGRATION) is set, an unmap is implied.
The patch includes Johannes's cleanup and dead TTU_ACTION macro removal code Link: http://lkml.kernel.org/r/4be3ea1bc56b26fd98a54d0a6f70bec63f6d8980.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 +++++++++------------- mm/memory-failure.c | 2 +- mm/rmap.c | 2 +- mm/vmscan.c | 11 ++++------- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e..7a3941492856 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,8 +191,6 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - int try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..f85adfe57484 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -907,7 +907,7 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; diff --git a/mm/rmap.c b/mm/rmap.c index f6838015810f..d7b6d780764b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1426,7 +1426,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (!PageDirty(page) && (flags & TTU_LZFREE)) { + if (!PageDirty(page)) { /* It's a freeable page by MADV_FREE */ dec_mm_counter(mm, MM_ANONPAGES); rp->lazyfreed++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 77832f0dbe0d..e5c00f2b98ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -966,7 +966,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; int ret = SWAP_SUCCESS; cond_resched(); @@ -1120,7 +1119,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, 
page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1138,9 +1136,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? - (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { + switch (ret = try_to_unmap(page, + ttu_flags | TTU_BATCH_FLUSH)) { case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; @@ -1348,7 +1345,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1740,7 +1737,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From 216032ff36d2c5ebff90bd2419abc7779beb2540 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:14 +1000 Subject: mm: don't assume anonymous pages have SwapBacked flag There are a few places the code assumes anonymous pages should have SwapBacked flag set. MADV_FREE pages are anonymous pages but we are going to add them to LRU_INACTIVE_FILE list and clear SwapBacked flag for them. The assumption doesn't hold any more, so fix them. Link: http://lkml.kernel.org/r/3945232c0df3dd6c4ef001976f35a95f18dcb407.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 - mm/khugepaged.c | 8 +++----- mm/migrate.c | 3 ++- mm/rmap.c | 3 ++- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ebc93e179f3..f7dd8b886d41 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2368,7 +2368,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4..88e4b1737c90 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); 
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); diff --git a/mm/migrate.c b/mm/migrate.c index 011df454812b..937378e8b883 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1944,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (PageSwapBacked(page)) + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index d7b6d780764b..b4084d09dbe8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1424,7 +1424,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), + page); if (!PageDirty(page)) { /* It's a freeable page by MADV_FREE */ -- cgit v1.2.3 From b8f0976d39d334991ac75e0f6067884f29055821 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:15 +1000 Subject: mm: move MADV_FREE pages into LRU_INACTIVE_FILE list madv()'s MADV_FREE indicate pages are 'lazyfree'. They are still anonymous pages, but they can be freed without pageout. To distinguish these from normal anonymous pages, we clear their SwapBacked flag. MADV_FREE pages could be freed without pageout, so they pretty much like used once file pages. For such pages, we'd like to reclaim them once there is memory pressure. Also it might be unfair reclaiming MADV_FREE pages always before used once file pages and we definitively want to reclaim the pages before other anonymous and file pages. To speed up MADV_FREE pages reclaim, we put the pages into LRU_INACTIVE_FILE list. The rationale is LRU_INACTIVE_FILE list is tiny nowadays and should be full of used once file pages. Reclaiming MADV_FREE pages will not have much interfere of anonymous and active file pages. And the inactive file pages and MADV_FREE pages will be reclaimed according to their age, so we don't reclaim too many MADV_FREE pages too. Putting the MADV_FREE pages into LRU_INACTIVE_FILE_LIST also means we can reclaim the pages without swap support. This idea is suggested by Johannes. This patch doesn't move MADV_FREE pages to LRU_INACTIVE_FILE list yet to avoid bisect failure, next patch will do it. The patch is based on Minchan's original patch. 
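For reference, none of this changes the user-visible API: an application simply tags anonymous ranges it is done with via madvise(2). A minimal, hypothetical caller (not part of this series; MADV_FREE needs Linux 4.5+ and a libc that defines the flag) looks like this:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;		/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xab, len);		/* touch it so pages are actually allocated */

	/*
	 * The allocator is done with the range for now.  The kernel may
	 * reclaim these pages lazily under pressure; writing to the range
	 * again later is legal and reuses surviving pages or gets fresh ones.
	 */
#ifdef MADV_FREE
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");
#endif
	return 0;
}

With this series such ranges sit on the inactive file LRU, so they can be dropped under pressure even on the swapless machines described in the series introduction.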
Link: http://lkml.kernel.org/r/2f87063c1e9354677b7618c647abde77b07561e5.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- include/linux/vm_event_item.h | 2 +- mm/huge_memory.c | 3 --- mm/madvise.c | 2 -- mm/swap.c | 50 ++++++++++++++++++++++++------------------- mm/vmstat.c | 1 + 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d..486494e6b2fc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a80b7b59cf33..d84ae90ccd5c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7dd8b886d41..ccd301360ad1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, tlb->fullmm); diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..cf3021b05b32 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,8 +411,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } } diff --git a/mm/swap.c b/mm/swap.c index c4910f14f957..c4fb4b9f7524 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -561,20 +561,26 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + 
del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * lazyfree pages are clean anonymous pages. They have + * SwapBacked flag cleared to distinguish normal anonymous + * pages + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -604,9 +610,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -638,22 +644,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. */ -void deactivate_page(struct page *page) -{ - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); +void mark_page_lazyfree(struct page *page) + { + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -704,7 +710,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, lru_add_drain_wq, work); diff --git a/mm/vmstat.c b/mm/vmstat.c index 9911ec58fce2..35de61b46408 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -991,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", -- cgit v1.2.3 From 312f6b0a70120436adbb122fa494e546a574ed90 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:16 +1000 Subject: mm-move-madv_free-pages-into-lru_inactive_file-list-checkpatch-fixes WARNING: line over 80 characters #127: FILE: mm/swap.c:571: + del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); WARNING: please, no spaces at the start of a line #177: FILE: mm/swap.c:654: + {$ total: 0 errors, 2 warnings, 133 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/mm-move-madv_free-pages-into-lru_inactive_file-list.patch has style problems, please review. 
NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index c4fb4b9f7524..ac98eb443a03 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -568,7 +568,8 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, !PageUnevictable(page)) { bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); /* @@ -651,7 +652,7 @@ void deactivate_file_page(struct page *page) * This is done to accelerate the reclaim of @page. */ void mark_page_lazyfree(struct page *page) - { +{ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); -- cgit v1.2.3 From c3084daebe1c7dd6be2f4f437732a98993aea947 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:17 +1000 Subject: mm: reclaim MADV_FREE pages When memory pressure is high, we free MADV_FREE pages. If the pages are not dirty in pte, the pages could be freed immediately. Otherwise we can't reclaim them. We put the pages back to anonumous LRU list (by setting SwapBacked flag) and the pages will be reclaimed in normal swapout way. We use normal page reclaim policy. Since MADV_FREE pages are put into inactive file list, such pages and inactive file pages are reclaimed according to their age. This is expected, because we don't want to reclaim too many MADV_FREE pages before used once pages. Based on Minchan's original patch Link: http://lkml.kernel.org/r/14b8eb1d3f6bf6cc492833f183ac8c304e560484.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 ++ mm/madvise.c | 1 + mm/rmap.c | 40 +++++++++++++++++----------------------- mm/vmscan.c | 34 ++++++++++++++++++++++------------ 5 files changed, 43 insertions(+), 36 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7a3941492856..fee10d744ebd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 +#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ccd301360ad1..c474e6f0461a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1573,6 +1573,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); diff --git a/mm/madvise.c b/mm/madvise.c index cf3021b05b32..d3a6712c3e14 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -413,6 +413,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { diff --git a/mm/rmap.c b/mm/rmap.c index b4084d09dbe8..b7c47fd59108 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1288,11 +1288,6 @@ void 
page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1308,8 +1303,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1427,11 +1421,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), page); - if (!PageDirty(page)) { + /* + * swapin page could be clean, it has data stored in + * swap. We can't silently discard it without setting + * swap entry in the page table. + */ + if (!PageDirty(page) && !PageSwapCache(page)) { /* It's a freeable page by MADV_FREE */ dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; goto discard; + } else if (!PageSwapBacked(page)) { + /* dirty MADV_FREE page */ + set_pte_at(mm, address, pvmw.pte, pteval); + ret = SWAP_DIRTY; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { @@ -1499,18 +1503,15 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. + * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1531,11 +1532,8 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } return ret; } @@ -1562,14 +1560,10 @@ static int page_not_mapped(struct page *page) int try_to_munlock(struct page *page) { int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, diff --git a/mm/vmscan.c b/mm/vmscan.c index e5c00f2b98ab..ec4555369e17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -906,7 +906,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -987,7 +988,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1113,8 +1115,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? 
* Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) @@ -1135,9 +1139,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { + if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + case SWAP_DIRTY: + SetPageSwapBacked(page); + /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; @@ -1145,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1258,10 +1263,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1271,9 +1284,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* -- cgit v1.2.3 From 751632341c07df3cb18fc388e4d87f261a768713 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:18 +1000 Subject: mm: clean up lazyfree page handling We can make it simple to understand without need to be aware of clean-swapin page. This patch just clean up lazyfree page handling in try_to_unmap_one. Link: http://lkml.kernel.org/r/20170303025237.GB3503@bbox Signed-off-by: Minchan Kim Reviewed-by: Shaohua Li Cc: Michal Hocko Cc: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/rmap.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b7c47fd59108..519b7eb723d1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1421,17 +1421,17 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), page); - /* - * swapin page could be clean, it has data stored in - * swap. We can't silently discard it without setting - * swap entry in the page table. - */ - if (!PageDirty(page) && !PageSwapCache(page)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - goto discard; - } else if (!PageSwapBacked(page)) { - /* dirty MADV_FREE page */ + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } + + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. 
+ */ set_pte_at(mm, address, pvmw.pte, pteval); ret = SWAP_DIRTY; page_vma_mapped_walk_done(&pvmw); -- cgit v1.2.3 From 6f42a3110d537567eb0ea3ff1668c59fe7013598 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:19 +1000 Subject: mm: fix lazyfree BUG_ON check in try_to_unmap_one() If a page is swapbacked, it means it should be in swapcache in try_to_unmap_one's path. If a page is !swapbacked, it means it shouldn't be in swapcache in try_to_unmap_one's path. Check both cases at once and, if the check fails, warn and return SWAP_FAIL. Such a bug never means we should shut down the kernel. Link: http://lkml.kernel.org/r/20170307055551.GC29458@bbox Signed-off-by: Minchan Kim Suggested-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Shaohua Li Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/rmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 519b7eb723d1..1b6105be87f3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1418,8 +1418,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), - page); + if (VM_WARN_ON_ONCE(PageSwapBacked(page) != + PageSwapCache(page))) { + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + + } /* MADV_FREE page check */ if (!PageSwapBacked(page)) { -- cgit v1.2.3 From 50b06ec48564f28f673f48d852fa188fcf2c92b7 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:20 +1000 Subject: mm: do not use VM_WARN_ON_ONCE as if condition Sergey reported that VM_WARN_ON_ONCE() returns void with !CONFIG_DEBUG_VM, so unlike WARN_ON() it cannot be used as an if condition. This patch fixes it. Link: http://lkml.kernel.org/r/20170309060226.GB854@bbox Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/rmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1b6105be87f3..a19bd8b8ab0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1418,12 +1418,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - if (VM_WARN_ON_ONCE(PageSwapBacked(page) != - PageSwapCache(page))) { + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; - } /* MADV_FREE page check */ -- cgit v1.2.3 From e92e4622d3c09f7f7fb5778c90e7cb92b4b2412c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:21 +1000 Subject: mm: enable MADV_FREE for swapless system Now MADV_FREE pages can be easily reclaimed even on a swapless system. We can safely enable MADV_FREE for all systems.
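
For reference, the userspace side of the contract looks roughly like this (a hypothetical sketch, not part of the patch; it assumes libc headers that define MADV_FREE, and the program itself is made up):

#include <sys/mman.h>
#include <string.h>

int main(void)
{
	size_t len = 1 << 20;
	char *buf;

	/* MADV_FREE only applies to private anonymous mappings. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);			/* dirty the pages */

	/*
	 * Mark the contents disposable: the mapping stays valid, but under
	 * memory pressure the kernel may discard the pages instead of
	 * swapping them out.  A later write to a page cancels its lazy free.
	 */
	madvise(buf, len, MADV_FREE);

	buf[0] = 1;	/* this page is dirtied again and will not be dropped */

	munmap(buf, len);
	return 0;
}
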
Link: http://lkml.kernel.org/r/155648585589300bfae1d45078e7aebb3d988b87.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/madvise.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index d3a6712c3e14..a09d2d3dfae9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -650,13 +650,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - /* - * XXX: In this implementation, MADV_FREE works like - * MADV_DONTNEED on swapless system or full swap. - */ - if (get_nr_swap_pages() > 0) - return madvise_free(vma, prev, start, end); - /* passthrough */ + return madvise_free(vma, prev, start, end); case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: -- cgit v1.2.3 From 69c0412e8ff4ee456f36e2bb2a82358c81fe9dce Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:22 +1000 Subject: proc: show MADV_FREE pages info in smaps Show MADV_FREE pages info of each vma in smaps. The interface is for diganose or monitoring purpose, userspace could use it to understand what happens in the application. Since userspace could dirty MADV_FREE pages without notice from kernel, this interface is the only place we can get accurate accounting info about MADV_FREE pages. Link: http://lkml.kernel.org/r/89efde633559de1ec07444f2ef0f4963a97a2ce8.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/task_mmu.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index c94b4675d021..45853e116eef 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -412,6 +412,7 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB @@ -441,6 +442,9 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. +"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). +The memory isn't freed immediately with madvise(). It's freed in memory +pressure if the memory is clean. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..b0e3800b4cf5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 
1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, -- cgit v1.2.3 From afd080b551855b2440925117f9019becadbe8b06 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:22 +1000 Subject: proc-show-madv_free-pages-info-in-smaps-fix update Documentation/filesystems/proc.txt Cc: Shaohua Li Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 45853e116eef..0b58b317dc76 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -444,7 +444,9 @@ a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). The memory isn't freed immediately with madvise(). It's freed in memory -pressure if the memory is clean. +pressure if the memory is clean. Please note that the printed value might +be lower than the real value due to optimizations used in the current +implementation. If this is not desirable please file a bug report. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. -- cgit v1.2.3 From 47eecd1eaf803dec48f8d96a83b99bc0dc72662d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:24 +1000 Subject: mm: memcontrol: provide shmem statistics Cgroups currently don't report how much shmem they use, which can be useful data to have, in particular since shmem is included in the cache/file item while being reclaimed like anonymous memory. Add a counter to track shmem pages during charging and uncharging. Link: http://lkml.kernel.org/r/20170221164343.32252-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Chris Down Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- Documentation/cgroup-v2.txt | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 28 ++++++++++++++++++++-------- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1e..e50b95c25868 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. 
Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb7250c45cb8..c5ebb32fef49 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -46,6 +46,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..490d5b4676c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", + "shmem", "mapped_file", "dirty", "writeback", @@ -608,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - else + else { __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], + nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -5208,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", @@ -5476,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5495,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); @@ -5507,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + 
pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** -- cgit v1.2.3 From f58a7e847cde1b28baa9fc2ca611a993bc28ae5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:25 +1000 Subject: thp: reduce indentation level in change_huge_pmd() Patch series "thp: fix few MADV_DONTNEED races" For MADV_DONTNEED to work properly with huge pages, it's critical to not clear pmd intermittently unless you hold down_write(mmap_sem). Otherwise MADV_DONTNEED can miss the THP which can lead to userspace breakage. See example of such race in commit message of patch 2/4. All these races are found by code inspection. I haven't seen them triggered. I don't think it's worth to apply them to stable@. This patch (of 4): Restructure code in preparation for a fix. Link: http://lkml.kernel.org/r/20170302151034.27829-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c474e6f0461a..55badf9baf0d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1723,37 +1723,37 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } -- cgit v1.2.3 From 4eee541efec1e9965c4b302d7ad70b7cacfcf6a8 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 5 Apr 2017 09:20:26 +1000 Subject: thp: fix MADV_DONTNEED vs. numa balancing race In case prot_numa, we are under down_read(mmap_sem). It's critical to not clear pmd intermittently to avoid race with MADV_DONTNEED which is also under down_read(mmap_sem): CPU0: CPU1: change_huge_pmd(prot_numa=1) pmdp_huge_get_and_clear_notify() madvise_dontneed() zap_pmd_range() pmd_trans_huge(*pmd) == 0 (without ptl) // skip the pmd set_pmd_at(); // pmd is re-established The race makes MADV_DONTNEED miss the huge pmd and don't clear it which may break userspace. Found by code analysis, never saw triggered. Link: http://lkml.kernel.org/r/20170302151034.27829-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55badf9baf0d..23768acddf2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1745,7 +1745,39 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa && pmd_protnone(*pmd)) goto unlock; - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. + * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); -- cgit v1.2.3 From 2318e4706aeffde78a521196312880f331081b01 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:26 +1000 Subject: mm: drop unused pmdp_huge_get_and_clear_notify() Dave noticed that after fixing MADV_DONTNEED vs. numa balancing race the last pmdp_huge_get_and_clear_notify() user is gone. Let's drop the helper. Link: http://lkml.kernel.org/r/20170306112047.24809-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. 
* This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From a99ee3fe57fd1c5f23c20e291d1e2d56c5d766d7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:27 +1000 Subject: thp: fix MADV_DONTNEED vs. MADV_FREE race Both MADV_DONTNEED and MADV_FREE handled with down_read(mmap_sem). It's critical to not clear pmd intermittently while handling MADV_FREE to avoid race with MADV_DONTNEED: CPU0: CPU1: madvise_free_huge_pmd() pmdp_huge_get_and_clear_full() madvise_dontneed() zap_pmd_range() pmd_trans_huge(*pmd) == 0 (without ptl) // skip the pmd set_pmd_at(); // pmd is re-established It results in MADV_DONTNEED skipping the pmd, leaving it not cleared. It violates MADV_DONTNEED interface and can result is userspace misbehaviour. Basically it's the same race as with numa balancing in change_huge_pmd(), but a bit simpler to mitigate: we don't need to preserve dirty/young flags here due to MADV_FREE functionality. Link: http://lkml.kernel.org/r/20170302151034.27829-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23768acddf2f..8a239d965304 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1565,8 +1565,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, unlock_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); -- cgit v1.2.3 From 8fa0fc1aaa3004fbee769176f5de5b442313c618 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:28 +1000 Subject: thp-fix-madv_dontneed-vs-madv_free-race-fix Urgh... Power is special again. Link: http://lkml.kernel.org/r/20170303102636.bhd2zhtpds4mt62a@black.fi.intel.com Signed-off-by: Kirill A. Shutemov Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a239d965304..c36094c4fa4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1565,6 +1565,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, unlock_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); -- cgit v1.2.3 From 15b240ffe2b8d2f51dbf65c841ecb398b629c9bb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:28 +1000 Subject: thp: fix MADV_DONTNEED vs clear soft dirty race Yet another instance of the same race. Fix is identical to change_huge_pmd(). See "thp: fix MADV_DONTNEED vs. numa balancing race" for more details. Link: http://lkml.kernel.org/r/20170302151034.27829-5-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0e3800b4cf5..f0c8b33d99b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -906,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); -- cgit v1.2.3 From 475c08708b9287294943379efd6e56ab67037bff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:20:29 +1000 Subject: mm, swap: Fix a race in free_swap_and_cache() Before using cluster lock in free_swap_and_cache(), the swap_info_struct->lock will be held during freeing the swap entry and acquiring page lock, so the page swap count will not change when testing page information later. But after using cluster lock, the cluster lock (or swap_info_struct->lock) will be held only during freeing the swap entry. So before acquiring the page lock, the page swap count may be changed in another thread. If the page swap count is not 0, we should not delete the page from the swap cache. This is fixed via checking page swap count again after acquiring the page lock. I found the race when I review the code, so I didn't trigger the race via a test program. If the race occurs for an anonymous page shared by multiple processes via fork, multiple pages will be allocated and swapped in from the swap device for the previously shared one page. That is, the user-visible runtime effect is more memory will be used and the access latency for the page will be higher, that is, the performance regression. Link: http://lkml.kernel.org/r/20170301143905.12846-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b90..6b6bb1bb6209 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1111,6 +1111,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently swapped out? 
* This does not give an exact answer when swap count is continued, @@ -1119,17 +1131,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1297,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } -- cgit v1.2.3 From 8fff6f299969c2e298b12ef62a77d2f24c788c92 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 5 Apr 2017 09:20:29 +1000 Subject: mm: use is_migrate_highatomic() to simplify the code Introduce two helpers, is_migrate_highatomic() and is_migrate_highatomic_page(). Simplify the code, no functional changes. Link: http://lkml.kernel.org/r/58B94F15.6060606@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ mm/page_alloc.c | 14 ++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 04e0969966f6..4b25dd64a0be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -66,6 +66,11 @@ enum { /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern char * const migratetype_names[MIGRATE_TYPES]; +#define is_migrate_highatomic(migratetype) \ + (migratetype == MIGRATE_HIGHATOMIC) +#define is_migrate_highatomic_page(_page) \ + (get_pageblock_migratetype(_page) == MIGRATE_HIGHATOMIC) + #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f222dd91a5b..3b4b7d5017a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2035,8 +2035,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); @@ -2093,8 +2093,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. 
*/ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2151,8 +2150,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) + if (can_steal && !is_migrate_highatomic_page(page)) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2489,7 +2487,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2600,7 +2598,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } -- cgit v1.2.3 From 55da3de08375fb99fc115dfb9d604364ea6044aa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:30 +1000 Subject: mm-use-is_migrate_highatomic-to-simplify-the-code-fix use static inlines rather than macros, per mhocko Cc: Joonsoo Kim Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 +------ mm/internal.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b25dd64a0be..446cf68c1c09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, @@ -66,11 +66,6 @@ enum { /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern char * const migratetype_names[MIGRATE_TYPES]; -#define is_migrate_highatomic(migratetype) \ - (migratetype == MIGRATE_HIGHATOMIC) -#define is_migrate_highatomic_page(_page) \ - (get_pageblock_migratetype(_page) == MIGRATE_HIGHATOMIC) - #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) diff --git a/mm/internal.h b/mm/internal.h index c583ce1b32b9..823a7a89099b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -503,4 +503,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From dcf15639c61d23445556546fb6074cd16d4b84c6 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 5 Apr 2017 09:20:31 +1000 Subject: mm: use is_migrate_isolate_page() to simplify the code Use 
is_migrate_isolate_page() to simplify the code, no functional changes. Link: http://lkml.kernel.org/r/58B94FB1.8020802@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_isolation.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926a..7927bbb54a4e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); -- cgit v1.2.3 From 8686ce143327cd173c852eb1fd7f24736e128e8b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 5 Apr 2017 09:20:31 +1000 Subject: mm, vmstat: print non-populated zones in zoneinfo Initscripts can use the information (protection levels) from /proc/zoneinfo to configure vm.lowmem_reserve_ratio at boot. vm.lowmem_reserve_ratio is an array of ratios for each configured zone on the system. If a zone is not populated on an arch, /proc/zoneinfo suppresses its output. This results in there not being a 1:1 mapping between the set of zones emitted by /proc/zoneinfo and the zones configured by vm.lowmem_reserve_ratio. This patch shows statistics for non-populated zones in /proc/zoneinfo. The zones exist and hold a spot in the vm.lowmem_reserve_ratio array. Without this patch, it is not possible to determine which index in the array controls which zone if one or more zones on the system are not populated. Remaining users of walk_zones_in_node() are unchanged. Files such as /proc/pagetypeinfo require certain zone data to be initialized properly for display, which is not done for unpopulated zones. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1703031451310.98023@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Anshuman Khandual Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmstat.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 35de61b46408..f8844236926f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. 
+ */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1430,12 +1434,15 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1839,7 +1846,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1891,7 +1898,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } -- cgit v1.2.3 From 64f33106acd67891159b0ced339f2fc345ae0871 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 5 Apr 2017 09:20:32 +1000 Subject: mm, vmstat: suppress pcp stats for unpopulated zones in zoneinfo After "mm, vmstat: print non-populated zones in zoneinfo", /proc/zoneinfo will show unpopulated zones. The per-cpu pageset statistics are not relevant for unpopulated zones and can be potentially lengthy, so supress them when they are not interesting. Also moves lowmem reserve protection information above pcp stats since it is relevant for all zones per vm.lowmem_reserve_ratio. 
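
To make the consumer side concrete (a hypothetical sketch, not part of the patch), a boot script or tool can now rely on every zone, populated or not, exposing its protection array in /proc/zoneinfo, which keeps the array indices aligned with vm.lowmem_reserve_ratio:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[512], zone[64] = "?";

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		/* Zone headers look like: "Node 0, zone   Normal" */
		if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
			continue;
		/* With this series, the protection line exists for every zone. */
		if (strstr(line, "protection:"))
			printf("%s: %s", zone, strstr(line, "protection:"));
	}
	fclose(f);
	return 0;
}
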
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1703061400500.46428@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmstat.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index f8844236926f..fe937e32a7a6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1393,18 +1393,24 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; -- cgit v1.2.3 From ea04bcf38515398d3fbd93f0a2b74e4b1b24d593 Mon Sep 17 00:00:00 2001 From: Sangwoo Park Date: Wed, 5 Apr 2017 09:20:32 +1000 Subject: zram: reduce load operation in page_same_filled In page_same_filled function, all elements in the page is compared with next index value. The current comparison routine compares the (i)th and (i+1)th values of the page. In this case, two load operaions occur for each comparison. But if we store first value of the page stores at 'val' variable and using it to compare with others, the load opearation is reduced. It reduce load operation per page by up to 64times. Link: http://lkml.kernel.org/r/1488428104-7257-1-git-send-email-sangwoo2.park@lge.com Signed-off-by: Sangwoo Park Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dceb5edd1e54..01944419b1f3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -177,15 +177,17 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } -- cgit v1.2.3 From 6ddca4ad37510572bb45a2b3011818bcff31fc8c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 5 Apr 2017 09:20:33 +1000 Subject: lockdep: teach lockdep about memalloc_noio_save Patch series "scope GFP_NOFS api", v5. This patch (of 7): Commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") added the memalloc_noio_(save|restore) functions to enable people to modify the MM behavior by disabling I/O during memory allocation. This was further extended in Fixes: 934f3072c17c ("mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set"). memalloc_noio_* functions prevent allocation paths recursing back into the filesystem without explicitly changing the flags for every allocation site. 
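
A minimal sketch of that usage pattern (the function around it is made up; memalloc_noio_save()/memalloc_noio_restore() and PF_MEMALLOC_NOIO are the existing interfaces, found in linux/sched.h in kernels of this vintage):

#include <linux/sched.h>
#include <linux/slab.h>

static void *noio_scope_alloc(size_t size)
{
	unsigned int noio_flags;
	void *buf;

	/*
	 * Everything allocated in this scope behaves as GFP_NOIO: while the
	 * task has PF_MEMALLOC_NOIO set, the allocator strips __GFP_IO (and
	 * __GFP_FS), even for GFP_KERNEL allocations made deep inside
	 * helpers that cannot be told about the restriction.
	 */
	noio_flags = memalloc_noio_save();

	buf = kmalloc(size, GFP_KERNEL);	/* effectively GFP_NOIO here */

	memalloc_noio_restore(noio_flags);
	return buf;
}
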
However, lockdep hasn't been keeping up with the changes and it entirely misses handling the memalloc_noio adjustments. Instead, it is left to the callers of __lockdep_trace_alloc to call the function after they have shaven the respective GFP flags which can lead to false positives: [ 644.173373] ================================= [ 644.174012] [ INFO: inconsistent lock state ] [ 644.174012] 4.10.0-nbor #134 Not tainted [ 644.174012] --------------------------------- [ 644.174012] inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. [ 644.174012] fsstress/3365 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 644.174012] (&xfs_nondir_ilock_class){++++?.}, at: [] xfs_ilock+0x141/0x230 [ 644.174012] {IN-RECLAIM_FS-W} state was registered at: [ 644.174012] __lock_acquire+0x62a/0x17c0 [ 644.174012] lock_acquire+0xc5/0x220 [ 644.174012] down_write_nested+0x4f/0x90 [ 644.174012] xfs_ilock+0x141/0x230 [ 644.174012] xfs_reclaim_inode+0x12a/0x320 [ 644.174012] xfs_reclaim_inodes_ag+0x2c8/0x4e0 [ 644.174012] xfs_reclaim_inodes_nr+0x33/0x40 [ 644.174012] xfs_fs_free_cached_objects+0x19/0x20 [ 644.174012] super_cache_scan+0x191/0x1a0 [ 644.174012] shrink_slab+0x26f/0x5f0 [ 644.174012] shrink_node+0xf9/0x2f0 [ 644.174012] kswapd+0x356/0x920 [ 644.174012] kthread+0x10c/0x140 [ 644.174012] ret_from_fork+0x31/0x40 [ 644.174012] irq event stamp: 173777 [ 644.174012] hardirqs last enabled at (173777): [] __local_bh_enable_ip+0x70/0xc0 [ 644.174012] hardirqs last disabled at (173775): [] __local_bh_enable_ip+0x37/0xc0 [ 644.174012] softirqs last enabled at (173776): [] _xfs_buf_find+0x67a/0xb70 [ 644.174012] softirqs last disabled at (173774): [] _xfs_buf_find+0x5db/0xb70 [ 644.174012] [ 644.174012] other info that might help us debug this: [ 644.174012] Possible unsafe locking scenario: [ 644.174012] [ 644.174012] CPU0 [ 644.174012] ---- [ 644.174012] lock(&xfs_nondir_ilock_class); [ 644.174012] [ 644.174012] lock(&xfs_nondir_ilock_class); [ 644.174012] [ 644.174012] *** DEADLOCK *** [ 644.174012] [ 644.174012] 4 locks held by fsstress/3365: [ 644.174012] #0: (sb_writers#10){++++++}, at: [] mnt_want_write+0x24/0x50 [ 644.174012] #1: (&sb->s_type->i_mutex_key#12){++++++}, at: [] vfs_setxattr+0x6f/0xb0 [ 644.174012] #2: (sb_internal#2){++++++}, at: [] xfs_trans_alloc+0xfc/0x140 [ 644.174012] #3: (&xfs_nondir_ilock_class){++++?.}, at: [] xfs_ilock+0x141/0x230 [ 644.174012] [ 644.174012] stack backtrace: [ 644.174012] CPU: 0 PID: 3365 Comm: fsstress Not tainted 4.10.0-nbor #134 [ 644.174012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 644.174012] Call Trace: [ 644.174012] dump_stack+0x85/0xc9 [ 644.174012] print_usage_bug.part.37+0x284/0x293 [ 644.174012] ? print_shortest_lock_dependencies+0x1b0/0x1b0 [ 644.174012] mark_lock+0x27e/0x660 [ 644.174012] mark_held_locks+0x66/0x90 [ 644.174012] lockdep_trace_alloc+0x6f/0xd0 [ 644.174012] kmem_cache_alloc_node_trace+0x3a/0x2c0 [ 644.174012] ? vm_map_ram+0x2a1/0x510 [ 644.174012] vm_map_ram+0x2a1/0x510 [ 644.174012] ? vm_map_ram+0x46/0x510 [ 644.174012] _xfs_buf_map_pages+0x77/0x140 [ 644.174012] xfs_buf_get_map+0x185/0x2a0 [ 644.174012] xfs_attr_rmtval_set+0x233/0x430 [ 644.174012] xfs_attr_leaf_addname+0x2d2/0x500 [ 644.174012] xfs_attr_set+0x214/0x420 [ 644.174012] xfs_xattr_set+0x59/0xb0 [ 644.174012] __vfs_setxattr+0x76/0xa0 [ 644.174012] __vfs_setxattr_noperm+0x5e/0xf0 [ 644.174012] vfs_setxattr+0xae/0xb0 [ 644.174012] ? __might_fault+0x43/0xa0 [ 644.174012] setxattr+0x15e/0x1a0 [ 644.174012] ? 
__lock_is_held+0x53/0x90 [ 644.174012] ? rcu_read_lock_sched_held+0x93/0xa0 [ 644.174012] ? rcu_sync_lockdep_assert+0x2f/0x60 [ 644.174012] ? __sb_start_write+0x130/0x1d0 [ 644.174012] ? mnt_want_write+0x24/0x50 [ 644.174012] path_setxattr+0x8f/0xc0 [ 644.174012] SyS_lsetxattr+0x11/0x20 [ 644.174012] entry_SYSCALL_64_fastpath+0x23/0xc6 Let's fix this by making lockdep explicitly do the shaving of respective GFP flags. Fixes: 934f3072c17c ("mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set") Link: http://lkml.kernel.org/r/20170306131408.9828-2-mhocko@kernel.org Signed-off-by: Nikolay Borisov Signed-off-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. Wong Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/locking/lockdep.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..73a2025b8bde 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -2863,6 +2864,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = memalloc_noio_flags(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -3861,7 +3864,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); } void lockdep_clear_current_reclaim_state(void) -- cgit v1.2.3 From 6c0556eb86b90e799ca6dc15b9b599a0dfd594df Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:34 +1000 Subject: lockdep: allow to disable reclaim lockup detection The current implementation of the reclaim lockup detection can lead to false positives and those even happen and usually lead to tweak the code to silence the lockdep by using GFP_NOFS even though the context can use __GFP_FS just fine. See http://lkml.kernel.org/r/20160512080321.GA18496@dastard as an example. ================================= [ INFO: inconsistent lock state ] 4.5.0-rc2+ #4 Tainted: G O --------------------------------- inconsistent {RECLAIM_FS-ON-R} -> {IN-RECLAIM_FS-W} usage. kswapd0/543 [HC0[0]:SC0[0]:HE1:SE1] takes: (&xfs_nondir_ilock_class){++++-+}, at: [] xfs_ilock+0x177/0x200 [xfs] {RECLAIM_FS-ON-R} state was registered at: [] mark_held_locks+0x79/0xa0 [] lockdep_trace_alloc+0xb3/0x100 [] kmem_cache_alloc+0x33/0x230 [] kmem_zone_alloc+0x81/0x120 [xfs] [] xfs_refcountbt_init_cursor+0x3e/0xa0 [xfs] [] __xfs_refcount_find_shared+0x75/0x580 [xfs] [] xfs_refcount_find_shared+0x84/0xb0 [xfs] [] xfs_getbmap+0x608/0x8c0 [xfs] [] xfs_vn_fiemap+0xab/0xc0 [xfs] [] do_vfs_ioctl+0x498/0x670 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x6f CPU0 ---- lock(&xfs_nondir_ilock_class); lock(&xfs_nondir_ilock_class); *** DEADLOCK *** 3 locks held by kswapd0/543: stack backtrace: CPU: 0 PID: 543 Comm: kswapd0 Tainted: G O 4.5.0-rc2+ #4 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 ffffffff82a34f10 ffff88003aa078d0 ffffffff813a14f9 ffff88003d8551c0 ffff88003aa07920 ffffffff8110ec65 0000000000000000 0000000000000001 ffff880000000001 000000000000000b 0000000000000008 ffff88003d855aa0 Call Trace: [] dump_stack+0x4b/0x72 [] print_usage_bug+0x215/0x240 [] mark_lock+0x1f5/0x660 [] ? 
print_shortest_lock_dependencies+0x1a0/0x1a0 [] __lock_acquire+0xa80/0x1e50 [] ? kmem_cache_alloc+0x15e/0x230 [] ? kmem_zone_alloc+0x81/0x120 [xfs] [] lock_acquire+0xd8/0x1e0 [] ? xfs_ilock+0x177/0x200 [xfs] [] ? xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] down_write_nested+0x5e/0xc0 [] ? xfs_ilock+0x177/0x200 [xfs] [] xfs_ilock+0x177/0x200 [xfs] [] xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] xfs_fs_evict_inode+0xdc/0x1e0 [xfs] [] evict+0xc5/0x190 [] dispose_list+0x39/0x60 [] prune_icache_sb+0x4b/0x60 [] super_cache_scan+0x14f/0x1a0 [] shrink_slab.part.63.constprop.79+0x1e9/0x4e0 [] shrink_zone+0x15e/0x170 [] kswapd+0x4f1/0xa80 [] ? zone_reclaim+0x230/0x230 [] kthread+0xf2/0x110 [] ? kthread_create_on_node+0x220/0x220 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x220/0x220 To quote Dave: " Ignoring whether reflink should be doing anything or not, that's a "xfs_refcountbt_init_cursor() gets called both outside and inside transactions" lockdep false positive case. The problem here is lockdep has seen this allocation from within a transaction, hence a GFP_NOFS allocation, and now it's seeing it in a GFP_KERNEL context. Also note that we have an active reference to this inode. So, because the reclaim annotations overload the interrupt level detections and it's seen the inode ilock been taken in reclaim ("interrupt") context, this triggers a reclaim context warning where it thinks it is unsafe to do this allocation in GFP_KERNEL context holding the inode ilock... " This sounds like a fundamental problem of the reclaim lock detection. It is really impossible to annotate such a special usecase IMHO unless the reclaim lockup detection is reworked completely. Until then it is much better to provide a way to add "I know what I am doing flag" and mark problematic places. This would prevent from abusing GFP_NOFS flag which has a runtime effect even on configurations which have lockdep disabled. Introduce __GFP_NOLOCKDEP flag which tells the lockdep gfp tracking to skip the current allocation request. While we are at it also make sure that the radix tree doesn't accidentaly override tags stored in the upper part of the gfp_mask. Link: http://lkml.kernel.org/r/20170306131408.9828-3-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. 
Wong Cc: Nikolay Borisov Signed-off-by: Andrew Morton --- include/linux/gfp.h | 10 +++++++++- kernel/locking/lockdep.c | 4 ++++ lib/radix-tree.c | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index db373b9d3223..978232a3b4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -40,6 +40,11 @@ struct vm_area_struct; #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x4000000u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -179,8 +184,11 @@ struct vm_area_struct; #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 25 +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 73a2025b8bde..a06159c984e4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2884,6 +2884,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497..898e87998417 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu) void __init radix_tree_init(void) { int ret; + + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, -- cgit v1.2.3 From dcccc9d42e58df92f7114acfc721331e65ef1fc3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:34 +1000 Subject: xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite some time ago. We would like to make this concept more generic and use it for other filesystems as well. Let's start by giving the flag a more generic name PF_MEMALLOC_NOFS which is in line with an exiting PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO contexts. Replace all PF_FSTRANS usage from the xfs code in the first step before we introduce a full API for it as xfs uses the flag directly anyway. This patch doesn't introduce any functional change. Link: http://lkml.kernel.org/r/20170306131408.9828-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Darrick J. 
Wong Reviewed-by: Brian Foster Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.c | 4 ++-- fs/xfs/kmem.h | 2 +- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/xfs_aops.c | 6 +++--- fs/xfs/xfs_trans.c | 12 ++++++------ include/linux/sched.h | 2 ++ 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 70a5b55e0870..d0ac1a065539 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -63,13 +63,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) noio_flag = memalloc_noio_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) memalloc_noio_restore(noio_flag); return ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f0fc84fcaac2..a6c8da40c70d 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c3decedc9455..3059a3ec7ecb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 61494295d92f..05eca126c688 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. 
*/ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..f5969c8274fc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -914,7 +914,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +944,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +998,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..4528f7c9789f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1224,6 +1224,8 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ +#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example -- cgit v1.2.3 From b3225bf62ceec294665c03296ffe7f122e3b8188 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:35 +1000 Subject: mm: introduce memalloc_nofs_{save,restore} API GFP_NOFS context is used for the following 5 reasons currently - to prevent from deadlocks when the lock held by the allocation context would be needed during the memory reclaim - to prevent from stack overflows during the reclaim because the allocation is performed from a deep context already - to prevent lockups when the allocation context depends on other reclaimers to make a forward progress indirectly - just in case because this would be safe from the fs POV - silence lockdep false positives Unfortunately overuse of this allocation context brings some problems to the MM. Memory reclaim is much weaker (especially during heavy FS metadata workloads), OOM killer cannot be invoked because the MM layer doesn't have enough information about how much memory is freeable by the FS layer. 
In many cases it is far from clear why the weaker context is even used and so it might be used unnecessarily. We would like to get rid of those as much as possible. One way to do that is to use the flag in scopes rather than isolated cases. Such a scope is declared when really necessary, tracked per task and all the allocation requests from within the context will simply inherit the GFP_NOFS semantic. Not only this is easier to understand and maintain because there are much less problematic contexts than specific allocation requests, this also helps code paths where FS layer interacts with other layers (e.g. crypto, security modules, MM etc...) and there is no easy way to convey the allocation context between the layers. Introduce memalloc_nofs_{save,restore} API to control the scope of GFP_NOFS allocation context. This is basically copying memalloc_noio_{save,restore} API we have for other restricted allocation context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is just an alias for PF_FSTRANS which has been xfs specific until recently. There are no more PF_FSTRANS users anymore so let's just drop it. PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags is renamed to current_gfp_context because it now cares about both PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve their semantic. kmem_flags_convert() doesn't need to evaluate the flag anymore. This patch shouldn't introduce any functional changes. Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS) usage as much as possible and only use a properly documented memalloc_nofs_{save,restore} checkpoints where they are appropriate. Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.h | 2 +- include/linux/gfp.h | 8 ++++++++ include/linux/sched.h | 8 +++----- include/linux/sched/mm.h | 26 +++++++++++++++++++++++--- kernel/locking/lockdep.c | 6 +++--- mm/page_alloc.c | 10 ++++++---- mm/vmscan.c | 6 +++--- 7 files changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index a6c8da40c70d..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 978232a3b4ae..2bfcfd33e476 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -210,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. 
+ * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware diff --git a/include/linux/sched.h b/include/linux/sched.h index 4528f7c9789f..9c3ee2281a56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1211,9 +1211,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ @@ -1224,8 +1224,6 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ - /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..9daabe138c99 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. 
+ * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a06159c984e4..ab94a8606d3c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2864,7 +2864,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; - gfp_mask = memalloc_noio_flags(gfp_mask); + gfp_mask = current_gfp_context(gfp_mask); /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -2875,7 +2875,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } void lockdep_clear_current_reclaim_state(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b4b7d5017a4..b4a9f9382e0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3949,10 +3949,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constrains. This is mainly about + * GFP_NOFS resp. 
GFP_NOIO which has to be inherited for all + * allocation requests from a particular context which has + * been marked by memalloc_no{fs,io}_{save,restore} */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -7405,7 +7407,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/vmscan.c b/mm/vmscan.c index ec4555369e17..3ad66580b8b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2915,7 +2915,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2995,7 +2995,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3702,7 +3702,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), -- cgit v1.2.3 From c7c9c685b40e38d82ed89cb099d3118646520457 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:35 +1000 Subject: mm-introduce-memalloc_nofs_saverestore-api-fix fix comment typo, reflow comment Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4a9f9382e0c..0b1de367e79a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3949,10 +3949,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Apply scoped allocation constrains. This is mainly about - * GFP_NOFS resp. GFP_NOIO which has to be inherited for all - * allocation requests from a particular context which has - * been marked by memalloc_no{fs,io}_{save,restore} + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; -- cgit v1.2.3 From 6f840f9cc900929a7dd86906a31cde397a16aad8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:36 +1000 Subject: xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio* kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} API to prevent from reclaim recursion into the fs because vmalloc can invoke unconditional GFP_KERNEL allocations and these functions might be called from the NOFS contexts. 
The memalloc_noio_save will enforce GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should provide exactly what we need here - implicit GFP_NOFS context. Link: http://lkml.kernel.org/r/20170306131408.9828-6-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.c | 12 ++++++------ fs/xfs/xfs_buf.c | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index d0ac1a065539..780fc8986dab 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void *ptr; gfp_t lflags; @@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39..ca09061369cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; -- cgit v1.2.3 From fbb8c9f146ea722f52203b2a87c24a9514bd92dd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:36 +1000 Subject: jbd2: mark the transaction context with the scope GFP_NOFS context now that we have memalloc_nofs_{save,restore} api we can mark the whole transaction context as implicitly GFP_NOFS. All allocations will automatically inherit GFP_NOFS this way. 
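For illustration only, a minimal sketch of the scope pattern this relies on (the helper below is hypothetical, not part of the patch): every allocation issued between the save and the restore implicitly drops __GFP_FS, so a plain GFP_KERNEL request behaves as GFP_NOFS inside the scope.

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Hypothetical helper: allocate while a transaction-like NOFS scope is open. */
static void *alloc_within_nofs_scope(size_t size)
{
	unsigned int nofs_flags;
	void *p;

	nofs_flags = memalloc_nofs_save();	/* sets PF_MEMALLOC_NOFS */
	p = kmalloc(size, GFP_KERNEL);		/* treated as GFP_NOFS here */
	memalloc_nofs_restore(nofs_flags);	/* closes the NOFS scope */

	return p;
}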
This means that we do not have to mark any of those requests with GFP_NOFS and moreover all the ext4_kv[mz]alloc(GFP_NOFS) are also safe now because even the hardcoded GFP_KERNEL allocations deep inside the vmalloc will be NOFS now. Link: http://lkml.kernel.org/r/20170306131408.9828-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/jbd2/transaction.c | 12 ++++++++++++ include/linux/jbd2.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..d8f09f34285f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Make sure that no allocations done while the transaction is + * open is going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * scope of th GFP_NOFS context is over here and so we can + * restore the original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; -- cgit v1.2.3 From 12016b84e88322bbecdc0a0b4d23272f5c91081d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:37 +1000 Subject: jbd2-mark-the-transaction-context-with-the-scope-gfp_nofs-context-fix tweak comments Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton --- fs/jbd2/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d8f09f34285f..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -390,8 +390,8 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); /* - * Make sure that no allocations done while the transaction is - * open is going to recurse back to the fs layer. + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. */ handle->saved_alloc_context = memalloc_nofs_save(); return 0; @@ -1768,8 +1768,8 @@ int jbd2_journal_stop(handle_t *handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: /* - * scope of th GFP_NOFS context is over here and so we can - * restore the original alloc context. + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. 
*/ memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); -- cgit v1.2.3 From 670cd509d441dddccc89ecb33163bb8f3d374f3f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:37 +1000 Subject: jbd2: make the whole kjournald2 kthread NOFS safe kjournald2 is central to the transaction commit processing. As such any potential allocation from this kernel thread has to be GFP_NOFS. Make sure to mark the whole kernel thread GFP_NOFS by the memalloc_nofs_save. Link: http://lkml.kernel.org/r/20170306131408.9828-8-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5adc2fb62b0f..5e86e8ca1324 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -205,6 +206,13 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); + /* + * Make sure that no allocations from this kernel thread will ever recurse + * to the fs layer because we are responsible for the transaction commit + * and any fs involvement might get stuck waiting for the trasn. commit. + */ + memalloc_nofs_save(); + /* * And now, wait forever for commit wakeup events. */ -- cgit v1.2.3 From b213931158477c10f5e584ba3f0fdac5a628a7b4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:38 +1000 Subject: jbd2-make-the-whole-kjournald2-kthread-nofs-safe-checkpatch-fixes WARNING: line over 80 characters #42: FILE: fs/jbd2/journal.c:210: + * Make sure that no allocations from this kernel thread will ever recurse total: 0 errors, 1 warnings, 20 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/jbd2-make-the-whole-kjournald2-kthread-nofs-safe.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e86e8ca1324..c43fe83ee708 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -207,9 +207,10 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* - * Make sure that no allocations from this kernel thread will ever recurse - * to the fs layer because we are responsible for the transaction commit - * and any fs involvement might get stuck waiting for the trasn. commit. + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. 
*/ memalloc_nofs_save(); -- cgit v1.2.3 From bee959dd9d80673220934e52213dfd98d3c28a02 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 5 Apr 2017 09:20:39 +1000 Subject: mm: tighten up the fault path a little The round_up() macro generates a couple of unnecessary instructions in this usage: 48cd: 49 8b 47 50 mov 0x50(%r15),%rax 48d1: 48 83 e8 01 sub $0x1,%rax 48d5: 48 0d ff 0f 00 00 or $0xfff,%rax 48db: 48 83 c0 01 add $0x1,%rax 48df: 48 c1 f8 0c sar $0xc,%rax 48e3: 48 39 c3 cmp %rax,%rbx 48e6: 72 2e jb 4916 If we change round_up() to ((x) + __round_mask(x, y)) & ~__round_mask(x, y) then GCC can see through it and remove the mask (because that would be dead code given the subsequent shift): 48cd: 49 8b 47 50 mov 0x50(%r15),%rax 48d1: 48 05 ff 0f 00 00 add $0xfff,%rax 48d7: 48 c1 e8 0c shr $0xc,%rax 48db: 48 39 c3 cmp %rax,%rbx 48de: 72 2e jb 490e But that's problematic because we'd evaluate 'y' twice. Converting round_up into an inline function prevents it from being used in other definitions. The easiest thing to do is just change these three usages of round_up to use DIV_ROUND_UP. Also add an unlikely() because GCC's heuristic is wrong in this case. Link: http://lkml.kernel.org/r/20170207192812.5281-1-willy@infradead.org Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..68b166a9eda0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2202,12 +2202,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2256,8 +2256,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2323,7 +2323,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2369,8 +2369,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) -- cgit v1.2.3 From eaf02dbe42bb68dc2b7114a879e495b569562e86 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:20:39 +1000 Subject: mm: remove rodata_test_data export, add pr_fmt Since commit 3ad38ceb2769 ("x86/mm: Remove CONFIG_DEBUG_NX_TEST"), nothing is using the exported rodata_test_data variable, so drop the export. Additionally updates the pr_fmt to avoid redundant strings and adjusts some whitespace. 
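As background for the pr_fmt part of this change, a small sketch of the mechanism (the function below is made up for illustration): the pr_* helpers expand to printk(KERN_<LEVEL> pr_fmt(fmt), ...), so defining pr_fmt once at the top of the file prefixes every message and the per-call "rodata_test: " strings become redundant.

#define pr_fmt(fmt) "rodata_test: " fmt	/* must precede the includes */

#include <linux/printk.h>

static void report_failure(void)
{
	/* emits "rodata_test: test data was changed" at KERN_ERR level */
	pr_err("test data was changed\n");
}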
Link: http://lkml.kernel.org/r/20170307005313.GA85809@beast Signed-off-by: Kees Cook Cc: Jinbum Park Cc: Arjan van de Ven Signed-off-by: Andrew Morton --- include/linux/rodata_test.h | 1 - mm/rodata_test.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c51413..84766bcdd01f 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#define pr_fmt(fmt) "rodata_test: " fmt + #include #include const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } -- cgit v1.2.3 From 4867f817534bee769144e9576eef2ca6cc0eeac9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:40 +1000 Subject: mm: move pcp and lru-pcp draining into single wq We currently have 2 specific WQ_RECLAIM workqueues in the mm code. vmstat_wq for updating pcp stats and lru_add_drain_wq dedicated to drain per cpu lru caches. This seems more than necessary because both can run on a single WQ. Both do not block on locks requiring a memory allocation nor perform any allocations themselves. We will save one rescuer thread this way. On the other hand drain_all_pages() queues work on the system wq which doesn't have rescuer and so this depend on memory allocation (when all workers are stuck allocating and new ones cannot be created). This is not critical as there should be somebody invoking the OOM killer (e.g. the forking worker) and get the situation unstuck and eventually performs the draining. Quite annoying though. This worker should be using WQ_RECLAIM as well. We can reuse the same one as for lru draining and vmstat. 
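To illustrate the pattern being consolidated here (the names below are illustrative, not the actual mm code): a WQ_MEM_RECLAIM workqueue keeps a dedicated rescuer thread, so work queued on it can still make progress when memory pressure prevents forking new workers, and per-CPU drain work is queued on it with queue_work_on().

#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/init.h>

static struct workqueue_struct *reclaim_wq;	/* illustrative name */
static DEFINE_PER_CPU(struct work_struct, drain_work);

static void drain_fn(struct work_struct *work)
{
	/* drain this CPU's caches; must not itself allocate memory */
}

static int __init drain_wq_init(void)
{
	int cpu;

	/* WQ_MEM_RECLAIM guarantees a rescuer thread for forward progress */
	reclaim_wq = alloc_workqueue("reclaim_wq",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!reclaim_wq)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(&drain_work, cpu);

		INIT_WORK(work, drain_fn);
		queue_work_on(cpu, reclaim_wq, work);	/* runs on that CPU */
	}
	return 0;
}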
Link: http://lkml.kernel.org/r/20170307131751.24936-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Tetsuo Handa Acked-by: Vlastimil Babka Acked-by: Mel Gorman Tested-by: Yang Li Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++++ mm/page_alloc.c | 9 ++++++++- mm/swap.c | 27 ++++++++------------------- mm/vmstat.c | 14 +++++++++----- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 823a7a89099b..04d08ef91224 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -486,6 +486,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, enum ttu_flags; struct tlbflush_unmap_batch; + +/* + * only for MM internal work items which do not depend on + * any allocations or locks which might depend on allocations + */ +extern struct workqueue_struct *mm_percpu_wq; + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b1de367e79a..c0977c4d0501 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2364,6 +2364,13 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON_ONCE(!mm_percpu_wq)) + return; + /* Workqueues cannot recurse */ if (current->flags & PF_WQ_WORKER) return; @@ -2413,7 +2420,7 @@ void drain_all_pages(struct zone *zone) for_each_cpu(cpu, &cpus_with_pcps) { struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); INIT_WORK(work, drain_local_pages_wq); - schedule_work_on(cpu, work); + queue_work_on(cpu, mm_percpu_wq, work); } for_each_cpu(cpu, &cpus_with_pcps) flush_work(per_cpu_ptr(&pcpu_drain, cpu)); diff --git a/mm/swap.c b/mm/swap.c index ac98eb443a03..361bdb1575ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -677,30 +677,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -/* - * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM - * workqueue, aiding in getting memory freed. - */ -static struct workqueue_struct *lru_add_drain_wq; - -static int __init lru_init(void) -{ - lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); - - if (WARN(!lru_add_drain_wq, - "Failed to create workqueue lru_add_drain_wq")) - return -ENOMEM; - - return 0; -} -early_initcall(lru_init); - void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON(!mm_percpu_wq)) + return; + mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); @@ -714,7 +703,7 @@ void lru_add_drain_all(void) pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, lru_add_drain_wq, work); + queue_work_on(cpu, mm_percpu_wq, work); cpumask_set_cpu(cpu, &has_work); } } diff --git a/mm/vmstat.c b/mm/vmstat.c index fe937e32a7a6..6561402443cb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1563,7 +1563,6 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP -static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; @@ -1621,7 +1620,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. 
Keep on running the * update worker thread. */ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1700,7 +1699,7 @@ static void vmstat_shepherd(struct work_struct *w) struct delayed_work *dw = &per_cpu(vmstat_work, cpu); if (!delayed_work_pending(dw) && need_update(cpu)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); @@ -1716,7 +1715,6 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -1762,10 +1760,16 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif +struct workqueue_struct *mm_percpu_wq; + void __init init_mm_internals(void) { + int ret __maybe_unused; + + mm_percpu_wq = alloc_workqueue("vmstat", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + #ifdef CONFIG_SMP - int ret; ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", NULL, vmstat_cpu_dead); -- cgit v1.2.3 From 0ec46c948f7fc081671291037eb92c6e6a5fc46e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:40 +1000 Subject: mm-move-pcp-and-lru-pcp-drainging-into-single-wq-fix > Should the workqueue also have been renamed to mm_percpu_wq? Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmstat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6561402443cb..757be8303aa0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1766,11 +1766,10 @@ void __init init_mm_internals(void) { int ret __maybe_unused; - mm_percpu_wq = alloc_workqueue("vmstat", + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); #ifdef CONFIG_SMP - ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", NULL, vmstat_cpu_dead); if (ret < 0) -- cgit v1.2.3 From 34883c08506e0b33b283fe252c59edbc17efd32a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:41 +1000 Subject: mm, compaction: reorder fields in struct compact_control Patch series "try to reduce fragmenting fallbacks", v3. Last year, Johannes Weiner has reported a regression in page mobility grouping [1] and while the exact cause was not found, I've come up with some ways to improve it by reducing the number of allocations falling back to different migratetype and causing permanent fragmentation. The series was tested with mmtests stress-highalloc modified to do GFP_KERNEL order-4 allocations, on 4.9 with "mm, vmscan: fix zone balance check in prepare_kswapd_sleep" (without that, kcompactd indeed wasn't woken up) on UMA machine with 4GB memory. There were 5 repeats of each run, as the extfrag stats are quite volatile (note the stats below are sums, not averages, as it was less perl hacking for me). Success rate are the same, already high due to the low allocation order used, so I'm not including them. 
Compaction stats: (the patches are stacked, and I haven't measured the non-functional-changes patches separately) patch 1 patch 2 patch 3 patch 4 patch 7 patch 8 Compaction stalls 22449 24680 24846 19765 22059 17480 Compaction success 12971 14836 14608 10475 11632 8757 Compaction failures 9477 9843 10238 9290 10426 8722 Page migrate success 3109022 3370438 3312164 1695105 1608435 2111379 Page migrate failure 911588 1149065 1028264 1112675 1077251 1026367 Compaction pages isolated 7242983 8015530 7782467 4629063 4402787 5377665 Compaction migrate scanned 980838938 987367943 957690188 917647238 947155598 1018922197 Compaction free scanned 557926893 598946443 602236894 594024490 541169699 763651731 Compaction cost 10243 10578 10304 8286 8398 9440 Compaction stats are mostly within noise until patch 4, which decreases the number of compactions, and migrations. Part of that could be due to more pageblocks marked as unmovable, and async compaction skipping those. This changes a bit with patch 7, but not so much. Patch 8 increases free scanner stats and migrations, which comes from the changed termination criteria. Interestingly number of compactions decreases - probably the fully compacted pageblock satisfies multiple subsequent allocations, so it amortizes. Next comes the extfrag tracepoint, where "fragmenting" means that an allocation had to fallback to a pageblock of another migratetype which wasn't fully free (which is almost all of the fallbacks). I have locally added another tracepoint for "Page steal" into steal_suitable_fallback() which triggers in situations where we are allowed to do move_freepages_block(). If we decide to also do set_pageblock_migratetype(), it's "Pages steal with pageblock" with break down for which allocation migratetype we are stealing and from which fallback migratetype. The last part "due to counting" comes from patch 4 and counts the events where the counting of movable pages allowed us to change pageblock's migratetype, while the number of free pages alone wouldn't be enough to cross the threshold. patch 1 patch 2 patch 3 patch 4 patch 7 patch 8 Page alloc extfrag event 10155066 8522968 10164959 15622080 13727068 13140319 Extfrag fragmenting 10149231 8517025 10159040 15616925 13721391 13134792 Extfrag fragmenting for unmovable 159504 168500 184177 97835 70625 56948 Extfrag fragmenting unmovable placed with movable 153613 163549 172693 91740 64099 50917 Extfrag fragmenting unmovable placed with reclaim. 5891 4951 11484 6095 6526 6031 Extfrag fragmenting for reclaimable 4738 4829 6345 4822 5640 5378 Extfrag fragmenting reclaimable placed with movable 1836 1902 1851 1579 1739 1760 Extfrag fragmenting reclaimable placed with unmov. 2902 2927 4494 3243 3901 3618 Extfrag fragmenting for movable 9984989 8343696 9968518 15514268 13645126 13072466 Pages steal 179954 192291 210880 123254 94545 81486 Pages steal with pageblock 22153 18943 20154 33562 29969 33444 Pages steal with pageblock for unmovable 14350 12858 13256 20660 19003 20852 Pages steal with pageblock for unmovable from mov. 12812 11402 11683 19072 17467 19298 Pages steal with pageblock for unmovable from recl. 1538 1456 1573 1588 1536 1554 Pages steal with pageblock for movable 7114 5489 5965 11787 10012 11493 Pages steal with pageblock for movable from unmov. 6885 5291 5541 11179 9525 10885 Pages steal with pageblock for movable from recl. 229 198 424 608 487 608 Pages steal with pageblock for reclaimable 689 596 933 1115 954 1099 Pages steal with pageblock for reclaimable from unmov. 
273 219 537 658 547 667 Pages steal with pageblock for reclaimable from mov. 416 377 396 457 407 432 Pages steal with pageblock due to counting 11834 10075 7530 ... for unmovable 8993 7381 4616 ... for movable 2792 2653 2851 ... for reclaimable 49 41 63 What we can see is that "Extfrag fragmenting for unmovable" and "... placed with movable" drops with almost each patch, which is good as we are polluting less movable pageblocks with unmovable pages. The most significant change is patch 4 with movable page counting. On the other hand it increases "Extfrag fragmenting for movable" by 50%. "Pages steal" drops though, so these movable allocation fallbacks find only small free pages and are not allowed to steal whole pageblocks back. "Pages steal with pageblock" raises, because the patch increases the chances of pageblock migratetype changes to happen. This affects all migratetypes. The summary is that patch 4 is not a clear win wrt these stats, but I believe that the tradeoff it makes is a good one. There's less pollution of movable pageblocks by unmovable allocations. There's less stealing between pageblock, and those that remain have higher chance of changing migratetype also the pageblock itself, so it should more faithfully reflect the migratetype of the pages within the pageblock. The increase of movable allocations falling back to unmovable pageblock might look dramatic, but those allocations can be migrated by compaction when needed, and other patches in the series (7-9) improve that aspect. Patches 7 and 8 continue the trend of reduced unmovable fallbacks and also reduce the impact on movable fallbacks from patch 4. [1] https://www.spinics.net/lists/linux-mm/msg114237.html This patch (of 8): While currently there are (mostly by accident) no holes in struct compact_control (on x86_64), but we are going to add more bool flags, so place them all together to the end of the structure. While at it, just order all fields from largest to smallest. Link: http://lkml.kernel.org/r/20170307131545.28577-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/internal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 04d08ef91224..004471b72977 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -183,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -190,16 +191,15 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... 
*/ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ }; -- cgit v1.2.3 From 66403bc7d65352979de165613fa1dc655dca861e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:41 +1000 Subject: mm, compaction: remove redundant watermark check in compact_finished() When detecting whether compaction has succeeded in forming a high-order page, __compact_finished() employs a watermark check, followed by an own search for a suitable page in the freelists. This is not ideal for two reasons: - The watermark check also searches high-order freelists, but has a less strict criteria wrt fallback. It's therefore redundant and waste of cycles. This was different in the past when high-order watermark check attempted to apply reserves to high-order pages. - The watermark check might actually fail due to lack of order-0 pages. Compaction can't help with that, so there's no point in continuing because of that. It's possible that high-order page still exists and it terminates. This patch therefore removes the watermark check. This should save some cycles and terminate compaction sooner in some cases. Link: http://lkml.kernel.org/r/20170307131545.28577-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf..9222ff362f33 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1280,7 +1280,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ const int migratetype) { unsigned int order; - unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,13 +1307,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; - /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; -- cgit v1.2.3 From 25b17331ed1a2dcab6b12688bc6d63bcf00c5728 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:42 +1000 Subject: mm, page_alloc: split smallest stolen page in fallback The __rmqueue_fallback() function is called when there's no free page of requested migratetype, and we need to steal from a different one. There are various heuristics to make this event infrequent and reduce permanent fragmentation. The main one is to try stealing from a pageblock that has the most free pages, and possibly steal them all at once and convert the whole pageblock. Precise searching for such pageblock would be expensive, so instead the heuristics walks the free lists from MAX_ORDER down to requested order and assumes that the block with highest-order free page is likely to also have the most free pages in total. 
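In rough outline, the walk described above looks something like the sketch below; this is a simplified stand-in, not the exact mm/page_alloc.c code, and the loop body is abbreviated.

	int current_order;

	/*
	 * Scan from the largest order down and take the first fallback page
	 * found, assuming its pageblock is the emptiest candidate.
	 */
	for (current_order = MAX_ORDER - 1; current_order >= order;
							current_order--) {
		area = &zone->free_area[current_order];
		fallback_mt = find_suitable_fallback(area, current_order,
					start_migratetype, false, &can_steal);
		if (fallback_mt < 0)
			continue;

		page = list_first_entry(&area->free_list[fallback_mt],
					struct page, lru);
		/* possibly steal the whole pageblock, then split this page */
		break;
	}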
Chances are that together with the highest-order page, we steal also pages of lower orders from the same block. But then we still split the highest order page. This is wasteful and can contribute to fragmentation instead of avoiding it. This patch thus changes __rmqueue_fallback() to just steal the page(s) and put them on the freelist of the requested migratetype, and only report whether it was successful. Then we pick (and eventually split) the smallest page with __rmqueue_smallest(). This all happens under zone lock, so nobody can steal it from us in the process. This should reduce fragmentation due to fallbacks. At worst we are only stealing a single highest-order page and waste some cycles by moving it between lists and then removing it, but fallback is not exactly hot path so that should not be a concern. As a side benefit the patch removes some duplicate code by reusing __rmqueue_smallest(). Link: http://lkml.kernel.org/r/20170307131545.28577-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c0977c4d0501..76236b572fc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1953,23 +1953,41 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * use it's pages as requested migratetype in the future. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); + struct free_area *area; int pages; + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic_page(page)) + goto single_page; + /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; + pages = move_freepages_block(zone, page, start_type); /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2128,8 +2146,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. 
+ */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2150,32 +2173,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && !is_migrate_highatomic_page(page)) - steal_suitable_fallback(zone, page, start_migratetype); - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2187,13 +2195,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); -- cgit v1.2.3 From 6c0d9a04e3a0f6dcfee1ecefae8530e0adffa445 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:43 +1000 Subject: mm-page_alloc-split-smallest-stolen-page-in-fallback-fix The lkp-robot reported a test case stuck on boot due to the patch [1] which was due to endless loop in the modified __rmqueue(). It blindly expected that move_freepages_block() will succeed, but that can fail due to last pageblock pfn not belonging to the same zone as the fallback candidate page. This fix checks the result of move_freepages_block() and steals the single candidate page if it fails, which was also effectively done before [1]. 
[1] mmotm: mm-page_alloc-split-smallest-stolen-page-in-fallback.patch Link: http://lkml.kernel.org/r/59d71b35-d556-4fc9-ee2e-1574259282fd@suse.cz Signed-off-by: Vlastimil Babka Cc: Stephen Rothwell Cc: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76236b572fc6..299ecbae04d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1977,6 +1977,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; pages = move_freepages_block(zone, page, start_type); + /* moving whole block can fail due to zone boundary conditions */ + if (!pages) + goto single_page; /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || -- cgit v1.2.3 From 5ec0505c1417ae25edc0ad7106fa5122ceff2d53 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:43 +1000 Subject: mm, page_alloc: count movable pages when stealing from pageblock When stealing pages from pageblock of a different migratetype, we count how many free pages were stolen, and change the pageblock's migratetype if more than half of the pageblock was free. This might be too conservative, as there might be other pages that are not free, but were allocated with the same migratetype as our allocation requested. While we cannot determine the migratetype of allocated pages precisely (at least without the page_owner functionality enabled), we can count pages that compaction would try to isolate for migration - those are either on LRU or __PageMovable(). The rest can be assumed to be MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE, which we cannot easily distinguish. This counting can be done as part of free page stealing with little additional overhead. The page stealing code is changed so that it considers free pages plus pages of the "good" migratetype for the decision whether to change pageblock's migratetype. The result should be more accurate migratetype of pageblocks wrt the actual pages in the pageblocks, when stealing from semi-occupied pageblocks. This should help the efficiency of page grouping by mobility. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 47%. The number of movable allocations falling back to other pageblocks are increased by 55%, but these events don't cause permanent fragmentation, so the tradeoff should be positive. Later patches also offset the movable fallback increase to some extent. 
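As an illustration of the new claim rule (a standalone user-space sketch, not the kernel code; it assumes pageblock_order == 9, i.e. 512 pages per 2MB pageblock, and the page counts are made-up example values):

  #include <stdbool.h>
  #include <stdio.h>

  /* Example value: 4K pages, 2MB pageblocks -> 512 pages per pageblock. */
  #define EXAMPLE_PAGEBLOCK_NR_PAGES (1 << 9)

  /*
   * Sketch of the claim decision: change the pageblock's migratetype when
   * the free pages just moved plus the already-allocated pages of a
   * compatible ("alike") migratetype cover at least half of the pageblock.
   */
  static bool should_claim_block(int free_pages, int alike_pages)
  {
          return free_pages + alike_pages >= EXAMPLE_PAGEBLOCK_NR_PAGES / 2;
  }

  int main(void)
  {
          /* 200 free pages were moved, 80 allocated pages look compatible */
          printf("claim: %d\n", should_claim_block(200, 80)); /* 280 >= 256 -> 1 */
          /* counting only the free pages, as before this patch, would not claim */
          printf("claim: %d\n", should_claim_block(200, 0));  /* 200 <  256 -> 0 */
          return 0;
  }

In other words, a pageblock that is already half-populated by compatible allocations can now be claimed even when fewer than half of its pages are free.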
Link: http://lkml.kernel.org/r/20170307131545.28577-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 5 +-- mm/page_alloc.c | 72 ++++++++++++++++++++++++++++++++++-------- mm/page_isolation.c | 5 +-- 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 299ecbae04d6..a8f518bd5651 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1837,9 +1837,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1856,6 +1856,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1866,6 +1869,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1881,7 +1893,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1898,7 +1910,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1948,22 +1961,26 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. 
If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; - int pages; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. */ - if (is_migrate_highatomic_page(page)) + if (is_migrate_highatomic(old_block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ @@ -1976,13 +1993,39 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - pages = move_freepages_block(zone, page, start_type); + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + /* moving whole block can fail due to zone boundary conditions */ if (!pages) goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); @@ -2060,7 +2103,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2137,7 +2180,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7927bbb54a4e..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. 
*/ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); -- cgit v1.2.3 From eadd0fe934746c6343cb605ee885141219ab7a65 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:44 +1000 Subject: mm-page_alloc-count-movable-pages-when-stealing-from-pageblock-fix merge fix Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8f518bd5651..a834a578c256 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2018,7 +2018,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, } /* moving whole block can fail due to zone boundary conditions */ - if (!pages) + if (!free_pages) goto single_page; /* -- cgit v1.2.3 From c70542057ec37fc03c057cb73a906491a2595329 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:44 +1000 Subject: mm, compaction: change migrate_async_suitable() to suitable_migration_source() Preparation for making the decisions more complex and depending on compact_control flags. No functional change. Link: http://lkml.kernel.org/r/20170307131545.28577-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ mm/compaction.c | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 446cf68c1c09..618499159a7c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) diff --git a/mm/compaction.c b/mm/compaction.c index 9222ff362f33..a7c2f0da7228 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,6 +983,15 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION +static bool suitable_migration_source(struct compact_control *cc, + struct page *page) +{ + if (cc->mode != MIGRATE_ASYNC) + return true; + + return is_migrate_movable(get_pageblock_migratetype(page)); +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct compact_control *cc, struct page *page) @@ -1007,7 +1011,7 @@ static bool suitable_migration_target(struct compact_control *cc, } /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1246,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * 
Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ -- cgit v1.2.3 From 5a4d3740b1161d84a5c4a15b208b75ad188ae59e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:45 +1000 Subject: mm, compaction: add migratetype to compact_control Preparation patch. We are going to need migratetype at lower layers than compact_zone() and compact_finished(). Link: http://lkml.kernel.org/r/20170307131545.28577-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 15 +++++++-------- mm/internal.h | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a7c2f0da7228..c48da73e30a5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1279,10 +1279,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1338,12 +1339,11 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1476,9 +1476,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1528,8 +1528,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/internal.h b/mm/internal.h index 004471b72977..e7e709fd3043 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -193,6 +193,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ -- cgit v1.2.3 From bc84ff5428991f9c1b4d941927fa436503719a9e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:45 +1000 Subject: mm, 
compaction: restrict async compaction to pageblocks of same migratetype The migrate scanner in async compaction is currently limited to MIGRATE_MOVABLE pageblocks. This is a heuristic intended to reduce latency, based on the assumption that non-MOVABLE pageblocks are unlikely to contain movable pages. However, with the exception of THP's, most high-order allocations are not movable. Should the async compaction succeed, this increases the chance that the non-MOVABLE allocations will fallback to a MOVABLE pageblock, making the long-term fragmentation worse. This patch attempts to help the situation by changing async direct compaction so that the migrate scanner only scans the pageblocks of the requested migratetype. If it's a non-MOVABLE type and there are such pageblocks that do contain movable pages, chances are that the allocation can succeed within one of such pageblocks, removing the need for a fallback. If that fails, the subsequent sync attempt will ignore this restriction. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 30%. The number of movable allocations falling back is reduced by 12%. Link: http://lkml.kernel.org/r/20170307131545.28577-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +++++++++-- mm/page_alloc.c | 20 +++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c48da73e30a5..2c288e75840d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -986,10 +986,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->mode != MIGRATE_ASYNC) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; - return is_migrate_movable(get_pageblock_migratetype(page)); + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; } /* Returns true if the page is within a block suitable for migration to */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a834a578c256..7d0332b5f3d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3677,6 +3677,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3744,12 +3745,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
*/ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3761,7 +3767,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3842,7 +3848,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, -- cgit v1.2.3 From 32795e89bce9e578747e284a9883b382542703f3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:46 +1000 Subject: mm, compaction: finish whole pageblock to reduce fragmentation The main goal of direct compaction is to form a high-order page for allocation, but it should also help against long-term fragmentation when possible. Most lower-than-pageblock-order compactions are for non-movable allocations, which means that if we compact in a movable pageblock and terminate as soon as we create the high-order page, it's unlikely that the fallback heuristics will claim the whole block. Instead there might be a single unmovable page in a pageblock full of movable pages, and the next unmovable allocation might pick another pageblock and increase long-term fragmentation. To help against such scenarios, this patch changes the termination criteria for compaction so that the current pageblock is finished even though the high-order page already exists. Note that it might be possible that the high-order page formed elsewhere in the zone due to parallel activity, but this patch doesn't try to detect that. This is only done with sync compaction, because async compaction is limited to pageblock of the same migratetype, where it cannot result in a migratetype fallback. (Async compaction also eagerly skips order-aligned blocks where isolation fails, which is against the goal of migrating away as much of the pageblock as possible.) As a result of this patch, long-term memory fragmentation should be reduced. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 20%. The number Link: http://lkml.kernel.org/r/20170307131545.28577-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 36 ++++++++++++++++++++++++++++++++++-- mm/internal.h | 1 + 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 2c288e75840d..bc7903130501 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,6 +1318,17 @@ static enum compact_result __compact_finished(struct zone *zone, if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. 
+ */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; @@ -1338,8 +1349,29 @@ static enum compact_result __compact_finished(struct zone *zone, * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. + */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; diff --git a/mm/internal.h b/mm/internal.h index e7e709fd3043..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -202,6 +202,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long -- cgit v1.2.3 From 7fc5212771801fb2f185ca61d0e3d6083e8b19c1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:46 +1000 Subject: mm: do not use double negation for testing page flags With the discussion[1], I found it seems there are every PageFlags functions return bool at this moment so we don't need double negation any more. Although it's not a problem to keep it, it makes future users confused to use double negation for them, too. Remove such possibility. [1] https://marc.info/?l=linux-kernel&m=148881578820434 Frankly sepaking, I like every PageFlags to return bool instead of int. It will make it clear. AFAIR, Chen Gang had tried it but don't know why it was not merged at that time. http://lkml.kernel.org/r/1469336184-1904-1-git-send-email-chengang@emindsoft.com.cn Link: http://lkml.kernel.org/r/1488868597-32222-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Chen Gang Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88e4b1737c90..7cb9c88bb4a3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -548,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -1181,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. 
*/ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } -- cgit v1.2.3 From ca737d720d0b5abeb7ccb4e42e4ea12a8b7574ed Mon Sep 17 00:00:00 2001 From: Shantanu Goel Date: Wed, 5 Apr 2017 09:20:47 +1000 Subject: mm, vmscan: fix zone balance check in prepare_kswapd_sleep

Patch series "Reduce amount of time kswapd sleeps prematurely", v2.

The series is unusual in that the first patch fixes one problem and introduces other issues that are noted in the changelog. Patch 2 makes a minor modification that is worth considering on its own but leaves the kernel in a state where it behaves badly. It's not until patch 3 that there is an improvement against baseline.

This was mostly motivated by examining Chris Mason's "simoop" benchmark which puts the VM under similar pressure to HADOOP. It has been reported that the benchmark has regressed severely during the last number of releases. While I cannot reproduce all the same problems Chris experienced due to hardware limitations, there were a number of problems on a 2-socket machine with a single disk.

simoop latencies
                                       4.11.0-rc1            4.11.0-rc1
                                          vanilla          keepawake-v2
Amean    p50-Read            21670074.18 (  0.00%) 22668332.52 ( -4.61%)
Amean    p95-Read            25456267.64 (  0.00%) 26738688.00 ( -5.04%)
Amean    p99-Read            29369064.73 (  0.00%) 30991404.52 ( -5.52%)
Amean    p50-Write               1390.30 (  0.00%)      924.91 ( 33.47%)
Amean    p95-Write             412901.57 (  0.00%)     1362.62 ( 99.67%)
Amean    p99-Write            6668722.09 (  0.00%)    16854.04 ( 99.75%)
Amean    p50-Allocation         78714.31 (  0.00%)    74729.74 (  5.06%)
Amean    p95-Allocation        175533.51 (  0.00%)   101609.74 ( 42.11%)
Amean    p99-Allocation        247003.02 (  0.00%)   125765.57 ( 49.08%)

These are latencies. Read/write are threads reading fixed-size random blocks from a simulated database. The allocation latency is mmapping and faulting regions of memory. The p50, p95 and p99 rows report the worst latencies for 50%, 95% and 99% of the samples respectively. For example, the report indicates that while the test was running, 99% of writes completed 99.75% faster. It's worth noting that on a UMA machine no difference in performance with simoop was observed, so mileage will vary. It's noted that there is a slight impact to read latencies, but it's mostly due to IO scheduler decisions and offset by the large reduction in other latencies.

This patch (of 3):

The check in prepare_kswapd_sleep needs to match the one in balance_pgdat since the latter will return as soon as any one of the zones in the classzone is above the watermark. This is especially important for higher order allocations since balance_pgdat will typically reset the order to zero relying on compaction to create the higher order pages. Without this patch, prepare_kswapd_sleep fails to wake up kcompactd since the zone balance check fails.

It was first reported against 4.9.7 that kswapd is failing to wake up kcompactd due to a mismatch in the zone balance check between balance_pgdat() and prepare_kswapd_sleep(). balance_pgdat() returns as soon as a single zone satisfies the allocation but prepare_kswapd_sleep() requires all zones to do the same. This causes prepare_kswapd_sleep() to never succeed except in the order == 0 case and consequently, wakeup_kcompactd() is never called.
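The mismatch is easiest to see side by side. Below is a simplified user-space sketch (the zone_balanced() stub, the zone count and the helper names are illustrative, not the real kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  #define NR_ZONES 4

  /* Stub standing in for the real watermark check. */
  static bool zone_balanced(const bool *balanced, int zone)
  {
          return balanced[zone];
  }

  /* Old check: sleep only if *every* eligible zone is balanced. */
  static bool can_sleep_old(const bool *balanced, int classzone_idx)
  {
          for (int i = 0; i <= classzone_idx; i++)
                  if (!zone_balanced(balanced, i))
                          return false;
          return true;
  }

  /* New check, matching balance_pgdat(): sleep if *any* eligible zone is balanced. */
  static bool can_sleep_new(const bool *balanced, int classzone_idx)
  {
          for (int i = 0; i <= classzone_idx; i++)
                  if (zone_balanced(balanced, i))
                          return true;
          return false;
  }

  int main(void)
  {
          /* only ZONE_NORMAL (index 2) was balanced by kswapd */
          bool balanced[NR_ZONES] = { false, false, true, false };

          printf("old: %d new: %d\n", can_sleep_old(balanced, 2),
                 can_sleep_new(balanced, 2)); /* old: 0 new: 1 */
          return 0;
  }

In the state above, balance_pgdat() already considers the node balanced, while the old prepare_kswapd_sleep() keeps refusing to sleep and therefore never reaches the point where kcompactd is woken.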
For the machine that originally motivated this patch, the state of compaction from /proc/vmstat looked this way after a day and a half of uptime:

compact_migrate_scanned 240496
compact_free_scanned 76238632
compact_isolated 123472
compact_stall 1791
compact_fail 29
compact_success 1762
compact_daemon_wake 0

After applying the patch and about 10 hours of uptime the state looks like this:

compact_migrate_scanned 59927299
compact_free_scanned 2021075136
compact_isolated 640926
compact_stall 4
compact_fail 2
compact_success 2
compact_daemon_wake 5160

Further notes from Mel that motivated him to pick this patch up and resend it:

It was observed for the simoop workload (pressures the VM similarly to HADOOP) that kswapd was failing to keep ahead of direct reclaim. The investigation noted that there was a need to rationalise kswapd decisions to reclaim with kswapd decisions to sleep. With this patch on a 2-socket box, there was a 49% reduction in direct reclaim scanning. However, the impact otherwise is extremely negative. Kswapd reclaim efficiency dropped from 98% to 76%. simoop has three latency-related metrics for read, write and allocation (an anonymous mmap and fault).

                                       4.11.0-rc1            4.11.0-rc1
                                          vanilla           fixcheck-v2
Amean    p50-Read            21670074.18 (  0.00%) 20464344.18 (  5.56%)
Amean    p95-Read            25456267.64 (  0.00%) 25721423.64 ( -1.04%)
Amean    p99-Read            29369064.73 (  0.00%) 30174230.76 ( -2.74%)
Amean    p50-Write               1390.30 (  0.00%)     1395.28 ( -0.36%)
Amean    p95-Write             412901.57 (  0.00%)    37737.74 ( 90.86%)
Amean    p99-Write            6668722.09 (  0.00%)   666489.04 ( 90.01%)
Amean    p50-Allocation         78714.31 (  0.00%)    86286.22 ( -9.62%)
Amean    p95-Allocation        175533.51 (  0.00%)   351812.27 (-100.42%)
Amean    p99-Allocation        247003.02 (  0.00%)  6291171.56 (-2447.00%)

Of greater concern is that the patch causes swapping, and page writes from kswapd context rose from 0 pages to 4189753 pages during the hour the workload ran for. By and large, the patch has very bad behaviour, but it is easily missed as the impact on a UMA machine is negligible. This patch is included with the data in case a bisection leads to this area. This patch is also a pre-requisite for the rest of the series.

Link: http://lkml.kernel.org/r/20170309075657.25121-2-mgorman@techsingularity.net Signed-off-by: Shantanu Goel Signed-off-by: Mel Gorman Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ad66580b8b4..1860bfab02c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3103,11 +3103,11 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (!managed_zone(zone)) continue; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (zone_balanced(zone, order, classzone_idx)) + return true; } - return true; + return false; } /* @@ -3304,7 +3304,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed.
+ */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to -- cgit v1.2.3 From 68c78b1fdcb2ee72fd80741e920a42136431d18c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Apr 2017 09:20:48 +1000 Subject: mm, vmscan: only clear pgdat congested/dirty/writeback state when balanced A pgdat tracks if recent reclaim encountered too many dirty, writeback or congested pages. The flags control whether kswapd writes pages back from reclaim context, tags pages for immediate reclaim when IO completes, whether processes block on wait_iff_congested and whether kswapd blocks when too many pages marked for immediate reclaim are encountered. The state is cleared in a check function with side-effects. With the patch "mm, vmscan: fix zone balance check in prepare_kswapd_sleep", the timing of when the bits get cleared changed. Due to the way the check works, it'll clear the bits if ZONE_DMA is balanced for a GFP_DMA allocation because it does not account for lowmem reserves properly. For the simoop workload, kswapd is not stalling when it should due to the premature clearing, writing pages from reclaim context like crazy and generally being unhelpful. This patch resets the pgdat bits related to page reclaim only when kswapd is going to sleep. The comparison with simoop is then 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla fixcheck-v2 clear-v2 Amean p50-Read 21670074.18 ( 0.00%) 20464344.18 ( 5.56%) 19786774.76 ( 8.69%) Amean p95-Read 25456267.64 ( 0.00%) 25721423.64 ( -1.04%) 24101956.27 ( 5.32%) Amean p99-Read 29369064.73 ( 0.00%) 30174230.76 ( -2.74%) 27691872.71 ( 5.71%) Amean p50-Write 1390.30 ( 0.00%) 1395.28 ( -0.36%) 1011.91 ( 27.22%) Amean p95-Write 412901.57 ( 0.00%) 37737.74 ( 90.86%) 34874.98 ( 91.55%) Amean p99-Write 6668722.09 ( 0.00%) 666489.04 ( 90.01%) 575449.60 ( 91.37%) Amean p50-Allocation 78714.31 ( 0.00%) 86286.22 ( -9.62%) 84246.26 ( -7.03%) Amean p95-Allocation 175533.51 ( 0.00%) 351812.27 (-100.42%) 400058.43 (-127.91%) Amean p99-Allocation 247003.02 ( 0.00%) 6291171.56 (-2447.00%) 10905600.00 (-4315.17%) Read latency is improved, write latency is mostly improved but allocation latency is regressed. kswapd is still reclaiming inefficiently, pages are being written back from writeback context and a host of other issues. However, given the change, it needed to be spelled out why the side-effect was moved. Link: http://lkml.kernel.org/r/20170309075657.25121-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Shantanu Goel Signed-off-by: Andrew Morton --- mm/vmscan.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1860bfab02c5..8c553fa0d800 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3056,17 +3056,17 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) return false; - /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty - */ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); - return true; } +/* Clear pgdat state for congested, dirty or under writeback. 
*/ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); +} + /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. @@ -3103,8 +3103,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (!managed_zone(zone)) continue; - if (zone_balanced(zone, order, classzone_idx)) + if (zone_balanced(zone, order, classzone_idx)) { + clear_pgdat_congested(pgdat); return true; + } } return false; -- cgit v1.2.3 From 68487be26caaf5aba92272e37b538346d3a2cc18 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Apr 2017 09:20:48 +1000 Subject: mm, vmscan: prevent kswapd sleeping prematurely due to mismatched classzone_idx kswapd is woken to reclaim a node based on a failed allocation request from any eligible zone. Once reclaiming in balance_pgdat(), it will continue reclaiming until there is an eligible zone available for the zone it was woken for. kswapd tracks what zone it was recently woken for in pgdat->kswapd_classzone_idx. If it has not been woken recently, this zone will be 0. However, the decision on whether to sleep is made on kswapd_classzone_idx which is 0 without a recent wakeup request and that classzone does not account for lowmem reserves. This allows kswapd to sleep when a low small zone such as ZONE_DMA is balanced for a GFP_DMA request even if a stream of allocations cannot use that zone. While kswapd may be woken again shortly in the near future there are two consequences -- the pgdat bits that control congestion are cleared prematurely and direct reclaim is more likely as kswapd slept prematurely. This patch flips kswapd_classzone_idx to default to MAX_NR_ZONES (an invalid index) when there has been no recent wakeups. If there are no wakeups, it'll decide whether to sleep based on the highest possible zone available (MAX_NR_ZONES - 1). It then becomes critical that the "pgdat balanced" decisions during reclaim and when deciding to sleep are the same. If there is a mismatch, kswapd can stay awake continually trying to balance tiny zones. simoop was used to evaluate it again. Two of the preparation patches regressed the workload so they are included as the second set of results. Otherwise this patch looks artifically excellent 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla clear-v2 keepawake-v2 Amean p50-Read 21670074.18 ( 0.00%) 19786774.76 ( 8.69%) 22668332.52 ( -4.61%) Amean p95-Read 25456267.64 ( 0.00%) 24101956.27 ( 5.32%) 26738688.00 ( -5.04%) Amean p99-Read 29369064.73 ( 0.00%) 27691872.71 ( 5.71%) 30991404.52 ( -5.52%) Amean p50-Write 1390.30 ( 0.00%) 1011.91 ( 27.22%) 924.91 ( 33.47%) Amean p95-Write 412901.57 ( 0.00%) 34874.98 ( 91.55%) 1362.62 ( 99.67%) Amean p99-Write 6668722.09 ( 0.00%) 575449.60 ( 91.37%) 16854.04 ( 99.75%) Amean p50-Allocation 78714.31 ( 0.00%) 84246.26 ( -7.03%) 74729.74 ( 5.06%) Amean p95-Allocation 175533.51 ( 0.00%) 400058.43 (-127.91%) 101609.74 ( 42.11%) Amean p99-Allocation 247003.02 ( 0.00%) 10905600.00 (-4315.17%) 125765.57 ( 49.08%) With this patch on top, write and allocation latencies are massively improved. The read latencies are slightly impaired but it's worth noting that this is mostly due to the IO scheduler and not directly related to reclaim. 
The vmstats are a bit of a mix but the relevant ones are as follows; 4.10.0-rc7 4.10.0-rc7 4.10.0-rc7 mmots-20170209 clear-v1r25keepawake-v1r25 Swap Ins 0 0 0 Swap Outs 0 608 0 Direct pages scanned 6910672 3132699 6357298 Kswapd pages scanned 57036946 82488665 56986286 Kswapd pages reclaimed 55993488 63474329 55939113 Direct pages reclaimed 6905990 2964843 6352115 Kswapd efficiency 98% 76% 98% Kswapd velocity 12494.375 17597.507 12488.065 Direct efficiency 99% 94% 99% Direct velocity 1513.835 668.306 1393.148 Page writes by reclaim 0.000 4410243.000 0.000 Page writes file 0 4409635 0 Page writes anon 0 608 0 Page reclaim immediate 1036792 14175203 1042571 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla clear-v2 keepawake-v2 Swap Ins 0 12 0 Swap Outs 0 838 0 Direct pages scanned 6579706 3237270 6256811 Kswapd pages scanned 61853702 79961486 54837791 Kswapd pages reclaimed 60768764 60755788 53849586 Direct pages reclaimed 6579055 2987453 6256151 Kswapd efficiency 98% 75% 98% Page writes by reclaim 0.000 4389496.000 0.000 Page writes file 0 4388658 0 Page writes anon 0 838 0 Page reclaim immediate 1073573 14473009 982507 Swap-outs are equivalent to baseline. Direct reclaim is reduced but not eliminated. It's worth noting that there are two periods of direct reclaim for this workload. The first is when it switches from preparing the files for the actual test itself. It's a lot of file IO followed by a lot of allocs that reclaims heavily for a brief window. While direct reclaim is lower with clear-v2, it is due to kswapd scanning aggressively and trying to reclaim the world which is not the right thing to do. With the patches applied, there is still direct reclaim but the phase change from "creating work files" to starting multiple threads that allocate a lot of anonymous memory faster than kswapd can reclaim. Scanning/reclaim efficiency is restored by this patch. Page writes from reclaim context are back at 0 which is ideal. Pages immediately reclaimed after IO completes is slightly improved but it is expected this will vary slightly. On UMA, there is almost no change so this is not expected to be a universal win. 
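For reference, a minimal user-space sketch of the sentinel convention adopted here (the zone enum and the helper are illustrative approximations of the new kswapd_classzone_idx() handling, not the kernel code itself):

  #include <stdio.h>

  enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

  /*
   * MAX_NR_ZONES is not a valid zone index, so it can mean "no wakeup
   * request pending".  With nothing pending, fall back to the classzone the
   * caller is interested in; otherwise keep the highest classzone requested
   * so far, so a pending high-zone request is not forgotten.
   */
  static enum zone_type classzone_for_kswapd(enum zone_type pending,
                                             enum zone_type classzone_idx)
  {
          if (pending == MAX_NR_ZONES)
                  return classzone_idx;
          return pending > classzone_idx ? pending : classzone_idx;
  }

  int main(void)
  {
          /* idle kswapd deciding whether to sleep: consider the highest zone */
          printf("%d\n", classzone_for_kswapd(MAX_NR_ZONES, MAX_NR_ZONES - 1));
          /* a pending ZONE_MOVABLE request survives a later GFP_DMA wakeup */
          printf("%d\n", classzone_for_kswapd(ZONE_MOVABLE, ZONE_DMA));
          return 0;
  }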
Link: http://lkml.kernel.org/r/20170309075657.25121-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Shantanu Goel Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- mm/vmscan.c | 120 +++++++++++++++++++++++++++++----------------------- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208bcd56..587eccfd4588 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1211,7 +1211,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* Reset the nr_zones, order and classzone_idx before reuse */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; } /* we can use NODE_DATA(nid) from here */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 8c553fa0d800..8ce39867140b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3049,14 +3049,36 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; - return true; + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } + + /* + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. + */ + if (mark == -1) + return true; + + return false; } /* Clear pgdat state for congested, dirty or under writeback. */ @@ -3075,8 +3097,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential @@ -3097,16 +3117,9 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, order, classzone_idx)) { - clear_pgdat_congested(pgdat); - return true; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } return false; @@ -3212,23 +3225,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. 
*/ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3295,6 +3297,22 @@ out: return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woke for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3336,7 +3354,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3390,7 +3408,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3420,20 +3439,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3459,9 +3481,6 @@ kswapd_try_sleep: reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3477,7 +3496,6 @@ kswapd_try_sleep: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3485,7 +3503,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = 
max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; @@ -3494,17 +3513,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } -- cgit v1.2.3 From bfd894cbfd10d0b00e1464c8b459da9de2315238 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:49 +1000 Subject: mm: page_alloc: __GFP_NOWARN shouldn't suppress stall warnings __GFP_NOWARN, which is usually added to avoid warnings from callsites that expect to fail and have fallbacks, currently also suppresses allocation stall warnings. These trigger when an allocation is stuck inside the allocator for 10 seconds or longer. But there is no class of allocations that can get legitimately stuck in the allocator for this long. This always indicates a problem. Always emit stall warnings. Restrict __GFP_NOWARN to alloc failures. Link: http://lkml.kernel.org/r/20170125181150.GA16398@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d0332b5f3d8..975b081b5c6c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3818,7 +3818,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; -- cgit v1.2.3 From 4a97827a90ad87b8acc7a0e23c8bb30215d4a217 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Apr 2017 09:20:49 +1000 Subject: mm/sparse: refine usemap_size() a little Current implementation calculates usemap_size in two steps: * calculate number of bytes to cover these bits * calculate number of "unsigned long" to cover these bytes It would be more clear by: * calculate number of "unsigned long" to cover these bits * multiple it with sizeof(unsigned long) This patch refine usemap_size() a little to make it more easy to understand. 
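Both orderings compute the same size; a quick user-space check (the macros and the example bit count below are stand-ins for the kernel's BITS_TO_LONGS, roundup() and SECTION_BLOCKFLAGS_BITS, chosen only for illustration):

  #include <stdio.h>

  #define EXAMPLE_BLOCKFLAGS_BITS 1100   /* stand-in for SECTION_BLOCKFLAGS_BITS */

  #define BITS_PER_LONG (8 * (unsigned long)sizeof(unsigned long))
  #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
  #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

  int main(void)
  {
          /* old: bits -> bytes, then round the byte count up to sizeof(unsigned long) */
          unsigned long old_size = ROUNDUP(ROUNDUP(EXAMPLE_BLOCKFLAGS_BITS, 8) / 8,
                                           sizeof(unsigned long));
          /* new: bits -> longs, then multiply by sizeof(unsigned long) */
          unsigned long new_size = BITS_TO_LONGS(EXAMPLE_BLOCKFLAGS_BITS) *
                                   sizeof(unsigned long);

          /* both print 144 on a 64-bit machine */
          printf("old=%lu new=%lu\n", old_size, new_size);
          return 0;
  }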
Link: http://lkml.kernel.org/r/20170310043713.96871-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/sparse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..6903c8fc3085 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From 24ac7307a6c911995f8311b2eb967e85a2442233 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:50 +1000 Subject: mm/compaction: ignore block suitable after check large free page By reviewing code, I find that if the migrate target is a large free page and we ignore suitable, it may splite large target free page into smaller block which is not good for defrag. So move the ignore block suitable after check large free page. As Vlastimil pointed out in RFC version that this patch is just based on logical analyses which might be better for future-proofing the function and it is most likely won't have any visible effect right now, for direct compaction shouldn't have to be called if there's a >=pageblock_order page already available. Link: http://lkml.kernel.org/r/1489490743-5364-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Minchan Kim Cc: Hanjun Guo Cc: Xishi Qiu Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index bc7903130501..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1003,9 +1003,6 @@ static bool suitable_migration_source(struct compact_control *cc, static bool suitable_migration_target(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) - return true; - /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1017,6 +1014,9 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ if (is_migrate_movable(get_pageblock_migratetype(page))) return true; -- cgit v1.2.3 From c1b1c16b17cf9b2ad64215709330d53c8c4f5ef1 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:50 +1000 Subject: mm/vmscan: more restrictive condition for retry in do_try_to_free_pages By reviewing code, I find that when enter do_try_to_free_pages, the may_thrash is always clear, and it will retry shrink zones to tap cgroup's reserves memory by setting may_thrash when the former shrink_zones reclaim nothing. However, when memcg is disabled or on legacy hierarchy, or there do not have any memcg protected by low limit, it should not do this useless retry at all, for we do not have any cgroup's reserves memory to tap, and we have already done hard work but made no progress, which as Michal pointed out in former version, we are trying hard to control the retry logical of page alloctor, and the current additional round of reclaim is just lame. 
Therefore, to avoid this unneeded retrying and make code more readable, we remove the may_thrash field in scan_control, instead, introduce memcg_low_reclaim and memcg_low_skipped, and only retry when memcg_low_skipped, by setting memcg_low_reclaim. Link: http://lkml.kernel.org/r/1490191893-5923-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Michal Hocko Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ce39867140b..068f21a1fa31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -100,6 +100,9 @@ struct scan_control { /* Can cgroups be reclaimed below their normal consumption range? */ unsigned int may_thrash:1; + /* Did we have any memcg protected by the low limit */ + unsigned int memcg_low_protection:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2512,6 +2515,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { + sc->memcg_low_protection = 1; + if (!sc->may_thrash) continue; mem_cgroup_events(memcg, MEMCG_LOW, 1); @@ -2768,7 +2773,7 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. */ - if (!sc->may_thrash) { + if (sc->memcg_low_protection && !sc->may_thrash) { sc->priority = initial_priority; sc->may_thrash = 1; goto retry; -- cgit v1.2.3 From 3597871d972d4d11d74cdf9d00c6b7fecd652cc3 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:51 +1000 Subject: mm-vmscan-more-restrictive-condition-for-retry-in-do_try_to_free_pages-v5 - remove may_thrash field in scan_control, and introduce mem_cgroup_reclaim and memcg_low_skipped to make code more readable. - Johannes Link: http://lkml.kernel.org/r/1490191893-5923-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Michal Hocko Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/vmscan.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 068f21a1fa31..e54c882d6789 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -97,11 +97,13 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Can cgroups be reclaimed below their normal consumption range? */ - unsigned int may_thrash:1; - - /* Did we have any memcg protected by the low limit */ - unsigned int memcg_low_protection:1; + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; @@ -2515,10 +2517,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { - sc->memcg_low_protection = 1; - - if (!sc->may_thrash) + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; + } mem_cgroup_events(memcg, MEMCG_LOW, 1); } @@ -2773,9 +2775,10 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. 
*/ - if (sc->memcg_low_protection && !sc->may_thrash) { + if (sc->memcg_low_skipped) { sc->priority = initial_priority; - sc->may_thrash = 1; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; goto retry; } -- cgit v1.2.3 From c5f2c8719aa0935ecaeb842cd5317489e2d87129 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:52 +1000 Subject: mm: remove unncessary ret in page_referenced Nobody uses the ret variable. Remove it. Link: http://lkml.kernel.org/r/1489555493-14659-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Kirill A. Shutemov Cc: Johannes Weiner Cc: Michal Hocko Cc: Kirill A. Shutemov Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/rmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index a19bd8b8ab0d..4baf504e4213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) -- cgit v1.2.3 From 9bf5b6c237b198bd3d8b9fd37466566ae0d72941 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:52 +1000 Subject: mm: remove SWAP_DIRTY in ttu If we find a lazyfree page is dirty, try_to_unmap_one() can simply SetPageSwapBacked() it, as is done for PG_mlocked pages, and return SWAP_FAIL. That is natural because the page is not swappable right now, so vmscan can activate it. There is no point in introducing the new return value SWAP_DIRTY to try_to_unmap() at the moment. Link: http://lkml.kernel.org/r/1489555493-14659-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Kirill A. Shutemov Cc: Anshuman Khandual Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 1 - mm/rmap.c | 4 ++-- mm/vmscan.c | 3 --- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fee10d744ebd..b556eefa62bc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 4baf504e4213..f6aa18d8a420 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1436,7 +1436,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_DIRTY; + SetPageSwapBacked(page); + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1507,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked.
- * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e54c882d6789..f1fd388454bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1147,9 +1147,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_DIRTY: - SetPageSwapBacked(page); - /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; -- cgit v1.2.3 From 45d2347672d947da64f95f22f947a2a843917aaf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:53 +1000 Subject: mm: remove SWAP_MLOCK check for SWAP_SUCCESS in ttu If the page is mapped and rescue in try_to_unmap_one, page_mapcount(page) == 0 cannot be true so page_mapcount check in try_to_unmap is enough to return SWAP_SUCCESS. IOW, SWAP_MLOCK check is redundant so remove it. Link: http://lkml.kernel.org/r/1489555493-14659-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index f6aa18d8a420..dfe40557ea29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1535,7 +1535,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) + if (!page_mapcount(page)) ret = SWAP_SUCCESS; return ret; } -- cgit v1.2.3 From 291d76b59c81abf62a3f94ee019231f38c93e6c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:54 +1000 Subject: mm: make try_to_munlock() return void try_to_munlock returns SWAP_MLOCK if the one of VMAs mapped the page has VM_LOCKED flag. In that time, VM set PG_mlocked to the page if the page is not pte-mapped THP which cannot be mlocked, either. With that, __munlock_isolated_page can use PageMlocked to check whether try_to_munlock is successful or not without relying on try_to_munlock's retval. It helps to make try_to_unmap/try_to_unmap_one simple with upcoming patches. Link: http://lkml.kernel.org/r/1489555493-14659-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/mlock.c | 6 ++---- mm/rmap.c | 17 +++++------------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b556eefa62bc..1b0cd4cf68e3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. 
*/ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_unlock() succeed or punt? */ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index dfe40557ea29..d59112367d47 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1552,18 +1552,10 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. */ -int try_to_munlock(struct page *page) -{ - int ret; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, @@ -1573,9 +1565,10 @@ int try_to_munlock(struct page *page) }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageMlocked(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) -- cgit v1.2.3 From 3669edd331aecfb4967f468a0204692e1ab10584 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:55 +1000 Subject: mm: remove SWAP_MLOCK in ttu ttu doesn't need to return SWAP_MLOCK. Instead, just return SWAP_FAIL because it means the page is not-swappable so it should move to another LRU list(active or unevictable). putback friends will move it to right list depending on the page's LRU flag. Link: http://lkml.kernel.org/r/1489555493-14659-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 1 - mm/rmap.c | 3 +-- mm/vmscan.c | 20 +++++++------------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1b0cd4cf68e3..3630d4dcee13 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -297,6 +297,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_SUCCESS 0 #define SWAP_AGAIN 1 #define SWAP_FAIL 2 -#define SWAP_MLOCK 3 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index d59112367d47..4935d317e654 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1506,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. 
*/ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index f1fd388454bd..cfd2651966a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -987,7 +987,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; @@ -1152,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1295,20 +1293,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: -- cgit v1.2.3 From 2955ab2b800fcf08ae908aa62d8ae32fc933b600 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:56 +1000 Subject: mm: remove SWAP_AGAIN in ttu In 2002, [1] introduced SWAP_AGAIN. At that time, try_to_unmap_one used spin_trylock(&mm->page_table_lock) so it's really easy to contend and fail to hold a lock so SWAP_AGAIN to keep LRU status makes sense. However, now we changed it to mutex-based lock and be able to block without skip pte so there is few of small window to return SWAP_AGAIN so remove SWAP_AGAIN and just return SWAP_FAIL. [1] c48c43e, minimal rmap Link: http://lkml.kernel.org/r/1489555493-14659-7-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 11 +++-------- mm/vmscan.c | 2 -- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4935d317e654..84ea7cc17c20 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1504,13 +1504,10 @@ static int page_mapcount_is_zero(struct page *page) * Return values are: * * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ int try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, @@ -1530,13 +1527,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (!page_mapcount(page)) - ret = SWAP_SUCCESS; - return ret; + return !page_mapcount(page) ? 
SWAP_SUCCESS : SWAP_FAIL; } static int page_not_mapped(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index cfd2651966a8..f80a54da5f7f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1150,8 +1150,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } -- cgit v1.2.3 From 304ba3e7009a075c0a5cd53bab4f93ece95f3c2d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:57 +1000 Subject: mm: make ttu's return boolean try_to_unmap() returns SWAP_SUCCESS or SWAP_FAIL so it's suitable for boolean return. This patch changes it. Link: http://lkml.kernel.org/r/1489555493-14659-8-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Naoya Horiguchi Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- mm/huge_memory.c | 6 +++--- mm/memory-failure.c | 26 ++++++++++++-------------- mm/rmap.c | 8 +++----- mm/vmscan.c | 7 +------ 5 files changed, 21 insertions(+), 30 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3630d4dcee13..6028c38d3cac 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,7 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -281,7 +281,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c36094c4fa4b..d14dd961f626 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2144,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f85adfe57484..3d3cf6add4c1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
*/ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; @@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; diff --git a/mm/rmap.c b/mm/rmap.c index 84ea7cc17c20..7eaca35c0f35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1501,12 +1501,10 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_FAIL - the page is unswappable + * If unmap is successful, return true. Otherwise, false. */ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, @@ -1531,7 +1529,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? SWAP_SUCCESS : SWAP_FAIL; + return !page_mapcount(page) ? 
true : false; } static int page_not_mapped(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a54da5f7f..7a30150b4dee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -972,7 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - int ret = SWAP_SUCCESS; cond_resched(); @@ -1145,13 +1144,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - switch (ret = try_to_unmap(page, - ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_FAIL: + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } -- cgit v1.2.3 From 63556f68b73a2a7b55e65552c6c7a80c36f7ffba Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:57 +1000 Subject: mm: make rmap_walk() return void There is no user of the return value from rmap_walk() and friends so this patch makes them void-returning functions. Link: http://lkml.kernel.org/r/1489555493-14659-9-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/ksm.h | 5 ++--- include/linux/rmap.h | 4 ++-- mm/ksm.c | 16 ++++++---------- mm/rmap.c | 32 +++++++++++++------------------- 4 files changed, 23 insertions(+), 34 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee58..78b44a024eaa 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6028c38d3cac..1d7d457ca0dc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -264,8 +264,8 @@ struct rmap_walk_control { bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 19b4f2dea7a5..6edffb9a795b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int 
rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (SWAP_AGAIN != rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/rmap.c b/mm/rmap.c index 7eaca35c0f35..7f22e8af80b5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1608,13 +1608,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1624,7 +1623,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1638,8 +1637,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1647,7 +1645,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1663,13 +1660,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1680,7 +1676,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1695,8 +1691,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1705,28 +1700,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From f6a77ae632c6fab01166f48afaa4d70a8a0620f5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:58 +1000 Subject: mm: make rmap_one boolean function rmap_one's return value controls whether rmap_work should contine to scan other ptes or not so it's target for changing to boolean. Return true if the scan should be continued. Otherwise, return false to stop the scanning. This patch makes rmap_one's return value to boolean. Link: http://lkml.kernel.org/r/1489555493-14659-10-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 +++++- mm/ksm.c | 2 +- mm/migrate.c | 4 ++-- mm/page_idle.c | 4 ++-- mm/rmap.c | 30 +++++++++++++++--------------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1d7d457ca0dc..13ed232cbb29 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -257,7 +257,11 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true. 
+ */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); diff --git a/mm/ksm.c b/mm/ksm.c index 6edffb9a795b..d9fc0e456128 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1977,7 +1977,7 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, + if (!rwc->rmap_one(page, vma, rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; diff --git a/mm/migrate.c b/mm/migrate.c index 937378e8b883..410b56bdabed 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 7f22e8af80b5..fabf9aac4ea0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -854,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -907,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1290,7 +1290,7 @@ void page_remove_rmap(struct page *page, bool compound) /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct 
page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1301,12 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; + bool ret = true; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1347,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1437,14 +1437,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ set_pte_at(mm, address, pvmw.pte, pteval); SetPageSwapBacked(page); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1637,7 +1637,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1691,7 +1691,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; -- cgit v1.2.3 From c6814dc1cbbbc7d11bbda8d302cf917ec2af0a2a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:59 +1000 Subject: mm: remove SWAP_[SUCCESS|AGAIN|FAIL] There is no user for it. Remove it. Link: http://lkml.kernel.org/r/1489555493-14659-11-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 13ed232cbb29..43ef2c30cb0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -295,11 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #endif /* _LINUX_RMAP_H */ -- cgit v1.2.3 From e190bdc81bbc9bf6877c57f043948a3a8dcaaa85 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:59 +1000 Subject: mm: use false instead of SWAP_FAIL There was mistake with git-rebase. SWAP_FAIL was removed. Use false. It could be folded into mm-make-ttus-return-boolean.patch. 
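As context for the rmap_one conversion in this series, here is a hedged sketch of what a caller looks like once rmap_one returns bool and rmap_walk() returns void; count_mappings_one() and count_mappings() are made-up names for illustration, only the rmap_walk_control fields follow the patches above.

#include <linux/rmap.h>
#include <linux/mm_types.h>

/* Return true to keep scanning, false to stop the walk early. */
static bool count_mappings_one(struct page *page, struct vm_area_struct *vma,
			       unsigned long address, void *arg)
{
	int *count = arg;

	(*count)++;
	return true;
}

static int count_mappings(struct page *page)
{
	int count = 0;
	struct rmap_walk_control rwc = {
		.rmap_one	= count_mappings_one,
		.arg		= &count,
	};

	rmap_walk(page, &rwc);		/* returns void after this series */
	return count;
}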
Link: http://lkml.kernel.org/r/20170316053313.GA19241@bbox Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index fabf9aac4ea0..589c6f80c090 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1419,7 +1419,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } -- cgit v1.2.3 From d3e1ae61c04fe987b2045bdb441efa2c776c374f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:00 +1000 Subject: mm, swap: Fix comment in __read_swap_cache_async cbab0e4eec29 ("swap: avoid read_swap_cache_async() race to deadlock while waiting on discard I/O completion") fixed a deadlock in read_swap_cache_async(). Because at that time, in swap allocation path, a swap entry may be set as SWAP_HAS_CACHE, then wait for discarding to complete before the page for the swap entry is added to the swap cache. But in 815c2c543d3a ("swap: make swap discard async"), the discarding for swap become asynchronous, waiting for discarding to complete will be done before the swap entry is set as SWAP_HAS_CACHE. So the comments in code is incorrect now. This patch fixes the comments. The cond_resched() added in the commit cbab0e4eec29 is not necessary now too. But if we added some sleep in swap allocation path in the future, there may be some hard to debug/reproduce deadlock bug. So it is kept. Link: http://lkml.kernel.org/r/20170317064635.12792-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rafael Aquini Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/swap_state.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8..7bfb9bd1ca21 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O completion at scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; -- cgit v1.2.3 From b988b7693347b8ea32bfc26b3d43e35451fe7429 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:01 +1000 Subject: mm, swap: improve readability via make spin_lock/unlock balanced This is just a cleanup patch, no functionality change. In cluster_list_add_tail(), spin_lock_nested() is used to lock the cluster, while unlock_cluster() is used to unlock the cluster. To improve the code readability. Use spin_unlock() directly to unlock the cluster. 
Link: http://lkml.kernel.org/r/20170317064635.12792-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6b6bb1bb6209..42fd620dcf4c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } -- cgit v1.2.3 From 1f1b7dc4acb469c145631c136453e01d00a8a91f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:02 +1000 Subject: mm, swap: avoid lock swap_avail_lock when held cluster lock Cluster lock is used to protect the swap_cluster_info and corresponding elements in swap_info_struct->swap_map[]. But it is found that now in scan_swap_map_slots(), swap_avail_lock may be acquired when cluster lock is held. This does no good except making the locking more complex and improving the potential locking contention, because the swap_info_struct->lock is used to protect the data structure operated in the code already. Fix this via moving the corresponding operations in scan_swap_map_slots() out of cluster lock. Link: http://lkml.kernel.org/r/20170317064635.12792-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 42fd620dcf4c..53b5881ee0d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -672,6 +672,9 @@ checks: else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +688,6 @@ checks: plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); -- cgit v1.2.3 From 7bab0c9f866c8dc26faab297ab59e6163455c5a4 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:03 +1000 Subject: mm: enable page poisoning early at boot On SPARSEMEM systems page poisoning is enabled after buddy is up, because of the dependency on page extension init. This causes the pages released by free_all_bootmem not to be poisoned. This either delays or misses the identification of some issues because the pages have to undergo another cycle of alloc-free-alloc for any corruption to be detected. Enable page poisoning early by getting rid of the PAGE_EXT_DEBUG_POISON flag. Since all the free pages will now be poisoned, the flag need not be verified before checking the poison during an alloc. 
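For readers unfamiliar with the mechanism being enabled earlier here, a hedged user-space sketch of the poison-on-free / verify-on-alloc idea that makes the per-page PAGE_EXT_DEBUG_POISON flag unnecessary; the 0xaa pattern and the helper names are stand-ins, and the kernel changes in the diff below are the authoritative version.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096
#define PAGE_POISON	0xaa	/* stand-in pattern for the kernel's poison byte */

/* Every page entering the free list gets the full poison pattern. */
static void poison_page(unsigned char *addr)
{
	memset(addr, PAGE_POISON, PAGE_SIZE);
}

/* On allocation, any byte that no longer matches means a write to a free page. */
static int poison_intact(const unsigned char *addr)
{
	for (size_t i = 0; i < PAGE_SIZE; i++)
		if (addr[i] != PAGE_POISON)
			return 0;
	return 1;
}

int main(void)
{
	static unsigned char page[PAGE_SIZE];

	poison_page(page);
	page[42] = 0;	/* simulated use-after-free scribble */
	printf("poison intact: %d\n", poison_intact(page));
	return 0;
}

Because every freed page is poisoned unconditionally, the allocation path can check the pattern without first consulting a per-page flag, which is why the flag and its page_ext plumbing can go.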
Link: http://lkml.kernel.org/r/1490358246-11001-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Laura Abbott Tested-by: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/page_alloc.c | 13 +++------ mm/page_ext.c | 3 --- mm/page_poison.c | 77 +++++++++--------------------------------------------- 4 files changed, 15 insertions(+), 79 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 00a8fa7e366a..8ae7460e44c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2497,7 +2497,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 975b081b5c6c..e025e620053d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1688,10 +1688,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1745,17 +1745,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..fc3e7ffd189e 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include #include -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf) early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) -{ - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) { /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
*/ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was posioned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } -- cgit v1.2.3 From 3fd18b363b5aee037c3617654200afe121235d62 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:03 +1000 Subject: mm-enable-page-poisoning-early-at-boot-v2 (1) Removed "select PAGE_EXTENSION" on CONFIG_PAGE_POISONING (2) Removed CONFIG_PAGE_POISONING checks in page_ext.c Link: http://lkml.kernel.org/r/1490878002-14423-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Cc: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton --- mm/Kconfig.debug | 1 - mm/page_ext.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_ext.c b/mm/page_ext.c index fc3e7ffd189e..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -124,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
- * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -201,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; -- cgit v1.2.3 From e942163483b72adce996d802c638c63c9002210d Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Wed, 5 Apr 2017 09:21:04 +1000 Subject: include/linux/migrate.h: add arg names to prototype It is preferred, and the rest of migrate.h gets it right. Link: http://lkml.kernel.org/r/1490336009-8024-1-git-send-email-pushkar.iit@gmail.com Signed-off-by: Pushkar Jambhlekar Signed-off-by: Andrew Morton --- include/linux/migrate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa76b516fa47..48e24844b3c5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +extern int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -- cgit v1.2.3 From 5ef9d7880f7c5332e779c24e100a920f494224eb Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 5 Apr 2017 09:21:05 +1000 Subject: mm/swap_slots.c: add warning if swap slots cache failed to initialize Add a warning diagnostics to user if we failed to allocate swap slots cache and use it. 
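The -fix patch that follows folds the warning into the error test, relying on WARN_ONCE() evaluating to (the truth value of) its condition; a hedged sketch of that idiom, where do_setup() is a hypothetical stand-in and the macro's real definition lives in include/asm-generic/bug.h:

/*
 * Illustrative only: WARN_ONCE(cond, fmt, ...) prints the warning the
 * first time cond is true, but evaluates to cond every time, so it can
 * drive the failure branch directly.
 */
static int enable_cache(void)
{
	int ret = do_setup();		/* hypothetical helper */

	if (WARN_ONCE(ret < 0, "setup failed (%d), running without the cache\n", ret))
		return ret;		/* taken on every failure, warned only once */

	return 0;
}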
Link: http://lkml.kernel.org/r/20170328234827.GA10107@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swap_slots.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58ad397..ceb783630880 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,8 +241,11 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) + if (ret < 0) { + WARN_ONCE(1, "Cache allocation failed (%s), operate without swap slots cache.\n", + __func__); goto out_unlock; + } swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: -- cgit v1.2.3 From f2fc9d97b3123496f8cbab8097efe08645963348 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:05 +1000 Subject: swap-add-warning-if-swap-slots-cache-failed-to-initialize-fix use WARN_ONCE return value, fix grammar in message Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/swap_slots.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index ceb783630880..aa1c415f4abd 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,11 +241,10 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) { - WARN_ONCE(1, "Cache allocation failed (%s), operate without swap slots cache.\n", - __func__); + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) goto out_unlock; - } + swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: -- cgit v1.2.3 From 9ff290b4038492db1267ac3af9ca83d9e84b7cd2 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Wed, 5 Apr 2017 09:21:06 +1000 Subject: mm: fix spelling error Fix variable name error in comments. No code changes. Link: http://lkml.kernel.org/r/20170403161655.5081-1-haolee.swjtu@gmail.com Signed-off-by: Hao Lee Signed-off-by: Andrew Morton --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2bfcfd33e476..2b1a44f5bdb6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the - * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long - * and there are 16 of them to cover all possible combinations of + * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT + * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. -- cgit v1.2.3 From 437f8d09b04871ce64ae42461bbd01dbecbf71e6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 5 Apr 2017 09:21:07 +1000 Subject: userfaultfd: selftest: combine all cases into a single executable Currently, selftest for userfaultfd is compiled three times: for anonymous, shared and hugetlb memory. Let's combine all the cases into a single executable which will have a command line option for selection of the test type. 
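As a condensed sketch of the approach taken in the patch below, the memory-type specific helpers and the expected ioctl mask are grouped behind an ops table that is selected from the new command line argument (anon, hugetlb or shmem):

	/* condensed from the patch below; not the complete definitions */
	struct uffd_test_ops {
		unsigned long expected_ioctls;
		void (*allocate_area)(void **alloc_area);
		int (*release_pages)(char *rel_area);
	};

	static struct uffd_test_ops *uffd_test_ops;	/* chosen from argv[1] */

run_vmtests is updated accordingly and invokes the single binary as ./userfaultfd anon 128 32, ./userfaultfd hugetlb 128 32 $mnt/ufd_test_file or ./userfaultfd shmem 128 32.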
Link: http://lkml.kernel.org/r/1490869741-5913-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 11 +- tools/testing/selftests/vm/run_vmtests | 6 +- tools/testing/selftests/vm/userfaultfd.c | 207 +++++++++++++++++-------------- 3 files changed, 116 insertions(+), 108 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 41642ba5e318..dba889004ea1 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += userfaultfd_hugetlb -TEST_GEN_FILES += userfaultfd_shmem TEST_GEN_FILES += mlock-random-test TEST_PROGS := run_vmtests include ../lib.mk -$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h - -$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread - -$(OUTPUT)/userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread +$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h +$(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index c92f6cf31d0a..3214a6456d13 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -95,7 +95,7 @@ echo " hugetlb regression testing." echo "--------------------" echo "running userfaultfd" echo "--------------------" -./userfaultfd 128 32 +./userfaultfd anon 128 32 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -107,7 +107,7 @@ echo "----------------------------" echo "running userfaultfd_hugetlb" echo "----------------------------" # 258MB total huge pages == 128MB src and 128MB dst -./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file +./userfaultfd hugetlb 128 32 $mnt/ufd_test_file if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file echo "----------------------------" echo "running userfaultfd_shmem" echo "----------------------------" -./userfaultfd_shmem 128 32 +./userfaultfd shmem 128 32 if [ $? 
-ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index e9449c801888..1eae79ae5b4e 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; #define BOUNCE_POLL (1<<3) static int bounces; -#ifdef HUGETLB_TEST +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + static int huge_fd; static char *huge_fd_off0; -#endif static unsigned long long *count_verify; static int uffd, uffd_flags, finished, *pipefd; static char *area_src, *area_dst; @@ -102,14 +105,7 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) -#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) - -/* Anonymous memory */ -#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE)) - -static int release_pages(char *rel_area) +static int anon_release_pages(char *rel_area) { int ret = 0; @@ -121,7 +117,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area) { if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { fprintf(stderr, "out of memory\n"); @@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area) } } -#else /* HUGETLB_TEST or SHMEM_TEST */ - -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC - -#ifdef HUGETLB_TEST /* HugeTLB memory */ -static int release_pages(char *rel_area) +static int hugetlb_release_pages(char *rel_area) { int ret = 0; @@ -152,7 +143,7 @@ static int release_pages(char *rel_area) } -static void allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_HUGETLB, huge_fd, @@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area) huge_fd_off0 = *alloc_area; } -#elif defined(SHMEM_TEST) - /* Shared memory */ -static int release_pages(char *rel_area) +static int shmem_release_pages(char *rel_area) { int ret = 0; @@ -182,7 +171,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); @@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area) } } -#else /* SHMEM_TEST */ -#error "Undefined test type" -#endif /* HUGETLB_TEST */ - -#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */ +struct uffd_test_ops { + unsigned long expected_ioctls; + void (*allocate_area)(void **alloc_area); + int (*release_pages)(char *rel_area); +}; + +#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE)) + +static struct uffd_test_ops anon_uffd_test_ops = { + .expected_ioctls = ANON_EXPECTED_IOCTLS, + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, +}; + +static struct uffd_test_ops 
*uffd_test_ops; static int my_bcmp(char *str1, char *str2, size_t n) { @@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (release_pages(area_src)) + if (uffd_test_ops->release_pages(area_src)) return 1; for (cpu = 0; cpu < nr_cpus; cpu++) { @@ -577,12 +590,12 @@ static int faulting_process(void) { unsigned long nr; unsigned long long count; + unsigned long split_nr_pages; -#ifndef HUGETLB_TEST - unsigned long split_nr_pages = (nr_pages + 1) / 2; -#else - unsigned long split_nr_pages = nr_pages; -#endif + if (test_type != TEST_HUGETLB) + split_nr_pages = (nr_pages + 1) / 2; + else + split_nr_pages = nr_pages; for (nr = 0; nr < split_nr_pages; nr++) { count = *area_count(area_dst, nr); @@ -594,7 +607,9 @@ static int faulting_process(void) } } -#ifndef HUGETLB_TEST + if (test_type == TEST_HUGETLB) + return 0; + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); if (area_dst == MAP_FAILED) @@ -610,7 +625,7 @@ static int faulting_process(void) } } - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; for (nr = 0; nr < nr_pages; nr++) { @@ -618,8 +633,6 @@ static int faulting_process(void) fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); } -#endif /* HUGETLB_TEST */ - return 0; } @@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset) { struct uffdio_zeropage uffdio_zeropage; int ret; - unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); + unsigned long has_zeropage; + + has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); if (offset >= nr_pages * page_size) fprintf(stderr, "unexpected offset %lu\n", @@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void) printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; if (userfaultfd_open(0) < 0) @@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -716,7 +731,7 @@ static int userfaultfd_events_test(void) printf("testing events (fork, remap, remove): "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | @@ -731,7 +746,7 @@ static int userfaultfd_events_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -773,10 +788,10 @@ static int userfaultfd_stress(void) int err; unsigned long userfaults[nr_cpus]; - allocate_area((void **)&area_src); + uffd_test_ops->allocate_area((void **)&area_src); if (!area_src) return 1; - allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_dst); if (!area_dst) return 1; @@ -856,7 +871,7 @@ static int userfaultfd_stress(void) fprintf(stderr, "register failure\n"); return 1; } - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & 
expected_ioctls) != expected_ioctls) { fprintf(stderr, @@ -888,7 +903,7 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. */ - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; /* bounce pass */ @@ -934,36 +949,6 @@ static int userfaultfd_stress(void) return userfaultfd_zeropage_test() || userfaultfd_events_test(); } -#ifndef HUGETLB_TEST - -int main(int argc, char **argv) -{ - if (argc < 3) - fprintf(stderr, "Usage: \n"), exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = sysconf(_SC_PAGE_SIZE); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / - nr_cpus; - if (!nr_pages_per_cpu) { - fprintf(stderr, "invalid MiB\n"); - fprintf(stderr, "Usage: \n"), exit(1); - } - bounces = atoi(argv[2]); - if (bounces <= 0) { - fprintf(stderr, "invalid bounces\n"); - fprintf(stderr, "Usage: \n"), exit(1); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* HUGETLB_TEST */ - /* * Copied from mlock2-tests.c */ @@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void) return hps; } -int main(int argc, char **argv) +static void set_test_type(const char *type) { - if (argc < 4) - fprintf(stderr, "Usage: \n"), - exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = default_huge_page_size(); + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "shmem")) { + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + } else { + fprintf(stderr, "Unknown test type: %s\n", type), exit(1); + } + + if (test_type == TEST_HUGETLB) + page_size = default_huge_page_size(); + else + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) - fprintf(stderr, "Unable to determine huge page size\n"), + fprintf(stderr, "Unable to determine page size\n"), exit(2); if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 > page_size) fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / +} + +int main(int argc, char **argv) +{ + if (argc < 4) + fprintf(stderr, "Usage: [hugetlbfs_file]\n"), + exit(1); + + set_test_type(argv[1]); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / nr_cpus; if (!nr_pages_per_cpu) { fprintf(stderr, "invalid MiB\n"); fprintf(stderr, "Usage: \n"), exit(1); } - bounces = atoi(argv[2]); + + bounces = atoi(argv[3]); if (bounces <= 0) { fprintf(stderr, "invalid bounces\n"); fprintf(stderr, "Usage: \n"), exit(1); } nr_pages = nr_pages_per_cpu * nr_cpus; - huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755); - if (huge_fd < 0) { - fprintf(stderr, "Open of %s failed", argv[3]); - perror("open"); - exit(1); - } - if (ftruncate(huge_fd, 0)) { - fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); - perror("ftruncate"); - exit(1); + + if (test_type == TEST_HUGETLB) { + if (argc < 5) + fprintf(stderr, "Usage: hugetlb \n"), + exit(1); + huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); + if (huge_fd < 0) { + fprintf(stderr, "Open of %s failed", argv[3]); + perror("open"); + 
exit(1); + } + if (ftruncate(huge_fd, 0)) { + fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + perror("ftruncate"); + exit(1); + } } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } -#endif #else /* __NR_userfaultfd */ #warning "missing __NR_userfaultfd definition" -- cgit v1.2.3 From d2da41daaebf5e81d5e102efb5cc079e938e8e47 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:07 +1000 Subject: zram: handle multiple pages attached to bio's bvec Johannes Thumshirn reported system goes the panic when using NVMe over Fabrics loopback target with zram. The reason is zram expects each bvec in bio contains a single page but nvme can attach a huge bulk of pages attached to the bio's bvec so that zram's index arithmetic could be wrong so that out-of-bound access makes panic. It was solved by limiting max_sectors with SECTORS_PER_PAGE in 0bc315381fe9 ("zram: set physical queue limits to avoid array out of bounds accesses") but that makes zram slow because a bio should split with each pages. So this patch makes zram aware of multiple pages in a bvec so it can solve the panic without causing any performance regression. Link: http://lkml.kernel.org/r/1491196653-7388-2-git-send-email-minchan@kernel.org Signed-off-by: Johannes Thumshirn Signed-off-by: Minchan Kim Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Cc: Jens Axboe Cc: Hannes Reinecke Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 01944419b1f3..28c2836f8c96 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -137,8 +137,7 @@ static inline bool valid_io_request(struct zram *zram, static void update_position(u32 *index, int *offset, struct bio_vec *bvec) { - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; + *index += (*offset + bvec->bv_len) / PAGE_SIZE; *offset = (*offset + bvec->bv_len) % PAGE_SIZE; } @@ -838,34 +837,20 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) } bio_for_each_segment(bvec, bio, iter) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec.bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. 
- */ - struct bio_vec bv; - - bv.bv_page = bvec.bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec.bv_offset; + struct bio_vec bv = bvec; + unsigned int remained = bvec.bv_len; + do { + bv.bv_len = min_t(unsigned int, PAGE_SIZE, remained); if (zram_bvec_rw(zram, &bv, index, offset, - op_is_write(bio_op(bio))) < 0) + op_is_write(bio_op(bio))) < 0) goto out; - bv.bv_len = bvec.bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, - op_is_write(bio_op(bio))) < 0) - goto out; - } else - if (zram_bvec_rw(zram, &bvec, index, offset, - op_is_write(bio_op(bio))) < 0) - goto out; + bv.bv_offset += bv.bv_len; + remained -= bv.bv_len; - update_position(&index, &offset, &bvec); + update_position(&index, &offset, &bv); + } while (remained); } bio_endio(bio); @@ -882,8 +867,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - blk_queue_split(queue, &bio, queue->bio_split); - if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); @@ -1191,8 +1174,6 @@ static int zram_add(void) blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; - zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE; - zram->disk->queue->limits.chunk_sectors = 0; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); /* * zram_bio_discard() will clear all logical blocks if logical block -- cgit v1.2.3 From c610e8d62da66b47e41338c96260766a8cb0095e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:08 +1000 Subject: zram: partial IO refactoring For architecture(PAGE_SIZE > 4K), zram have supported partial IO. However, the mixed code for handling normal/partial IO is too mess, error-prone to modify IO handler functions with upcoming feature so this patch aims for cleaning up zram's IO handling functions. 
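To make the flow easier to follow before the large diff, here is an illustrative, condensed sketch of the refactored partial-write path, with names taken from the patch below: a partial bvec becomes a read-modify-write on a temporary full page, so only full pages reach the compression path.

	static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
				   u32 index, int offset)
	{
		struct bio_vec vec = *bvec;
		struct page *page = NULL;
		int ret;

		if (is_partial_io(bvec)) {
			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
			if (!page)
				return -ENOMEM;
			/* read: decompress the existing page contents */
			ret = zram_decompress_page(zram, page, index);
			if (ret)
				goto out;
			/* modify: copy the partial data over it (kmap + memcpy) */
			vec.bv_page = page;
			vec.bv_len = PAGE_SIZE;
			vec.bv_offset = 0;
		}
		/* write: compress and store the now-full page */
		ret = __zram_bvec_write(zram, &vec, index, offset);
	out:
		if (page)
			__free_page(page);
		return ret;
	}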
Link: http://lkml.kernel.org/r/1491196653-7388-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 333 +++++++++++++++++++++++------------------- 1 file changed, 184 insertions(+), 149 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 28c2836f8c96..7938f4b98b01 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,6 +45,8 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static void zram_free_page(struct zram *zram, size_t index); + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -98,10 +100,17 @@ static void zram_set_obj_size(struct zram_meta *meta, meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } +#if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } +#else +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return false; +} +#endif static void zram_revalidate_disk(struct zram *zram) { @@ -191,18 +200,6 @@ static bool page_same_filled(void *ptr, unsigned long *element) return true; } -static void handle_same_page(struct bio_vec *bvec, unsigned long element) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -418,6 +415,53 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); +static bool zram_special_page_read(struct zram *zram, u32 index, + struct page *page, + unsigned int offset, unsigned int len) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_SAME)) { + void *mem; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + mem = kmap_atomic(page); + zram_fill_page(mem + offset, len, meta->table[index].element); + kunmap_atomic(mem); + return true; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + return false; +} + +static bool zram_special_page_write(struct zram *zram, u32 index, + struct page *page) +{ + unsigned long element; + void *mem = kmap_atomic(page); + + if (page_same_filled(mem, &element)) { + struct zram_meta *meta = zram->meta; + + kunmap_atomic(mem); + /* Free memory associated with this sector now. 
*/ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_SAME); + zram_set_element(meta, index, element); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.same_pages); + return true; + } + kunmap_atomic(mem); + + return false; +} + static void zram_meta_free(struct zram_meta *meta, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -504,169 +548,104 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_obj_size(meta, index, 0); } -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) { - int ret = 0; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; + int ret; unsigned long handle; unsigned int size; + void *src, *dst; + struct zram_meta *meta = zram->meta; + + if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) + return 0; bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); - return 0; - } - - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { - copy_page(mem, cmem); + dst = kmap_atomic(page); + copy_page(dst, src); + kunmap_atomic(dst); + ret = 0; } else { struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_decompress(zstrm, cmem, size, mem); + dst = kmap_atomic(page); + ret = zcomp_decompress(zstrm, src, size, dst); + kunmap_atomic(dst); zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) { + if (unlikely(ret)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - return ret; - } - return 0; + return ret; } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) + u32 index, int offset) { int ret; struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_same_page(bvec, meta->table[index].element); + page = bvec->bv_page; + if (zram_special_page_read(zram, index, page, bvec->bv_offset, + bvec->bv_len)) return 0; - } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; - if (!uncmem) { - pr_err("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; + if (is_partial_io(bvec)) { + /* Use a temporary buffer to decompress the page */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; } - ret = zram_decompress_page(zram, uncmem, index); - /* Should NEVER happen. Return bio error if it does. 
*/ + ret = zram_decompress_page(zram, page, index); if (unlikely(ret)) - goto out_cleanup; + goto out; - if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); + if (is_partial_io(bvec)) { + void *dst = kmap_atomic(bvec->bv_page); + void *src = kmap_atomic(page); - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); + memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + kunmap_atomic(src); + kunmap_atomic(dst); + } +out: if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); + return ret; } -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) +static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, + struct page *page, + unsigned long *out_handle, unsigned int *out_comp_len) { - int ret = 0; - unsigned int clen; + int ret; + unsigned int comp_len; + void *src; unsigned long handle = 0; - struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm = NULL; - unsigned long alloced_pages; - unsigned long element; - - page = bvec->bv_page; - if (is_partial_io(bvec)) { - /* - * This is a partial IO. We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; - } compress_again: - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_same_filled(uncmem, &element)) { - if (user_mem) - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - atomic64_inc(&zram->stats.same_pages); - ret = 0; - goto out; - } - - zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_compress(zstrm, uncmem, &clen); - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } + src = kmap_atomic(page); + ret = zcomp_compress(*zstrm, src, &comp_len); + kunmap_atomic(src); if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); - goto out; + return ret; } - src = zstrm->buffer; - if (unlikely(clen > max_zpage_size)) { - clen = PAGE_SIZE; - if (is_partial_io(bvec)) - src = uncmem; - } + if (unlikely(comp_len > max_zpage_size)) + comp_len = PAGE_SIZE; /* * handle allocation has 2 paths: @@ -682,50 +661,70 @@ compress_again: * from the slow path and handle has already been allocated. 
*/ if (!handle) - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); if (!handle) { zcomp_stream_put(zram->comp); - zstrm = NULL; - atomic64_inc(&zram->stats.writestall); - - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + *zstrm = zcomp_stream_get(zram->comp); if (handle) goto compress_again; + return -ENOMEM; + } - pr_err("Error allocating memory for compressed page: %u, size=%u\n", - index, clen); - ret = -ENOMEM; - goto out; + *out_handle = handle; + *out_comp_len = comp_len; + return 0; +} + +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + unsigned long handle; + unsigned int comp_len; + void *src, *dst; + struct zcomp_strm *zstrm; + unsigned long alloced_pages; + struct zram_meta *meta = zram->meta; + struct page *page = bvec->bv_page; + + if (zram_special_page_write(zram, index, page)) + return 0; + + zstrm = zcomp_stream_get(zram->comp); + ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); + if (ret) { + zcomp_stream_put(zram->comp); + return ret; } alloced_pages = zs_get_total_pages(meta->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zcomp_stream_put(zram->comp); zs_free(meta->mem_pool, handle); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + if (comp_len == PAGE_SIZE) { src = kmap_atomic(page); - copy_page(cmem, src); + copy_page(dst, src); kunmap_atomic(src); } else { - memcpy(cmem, src, clen); + memcpy(dst, zstrm->buffer, comp_len); } zcomp_stream_put(zram->comp); - zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); /* @@ -734,19 +733,54 @@ compress_again: */ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, clen); + zram_set_obj_size(meta, index, comp_len); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); + return 0; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page = NULL; + void *src; + struct bio_vec vec; + + vec = *bvec; + if (is_partial_io(bvec)) { + void *dst; + /* + * This is a partial IO. We need to read the full page + * before to write the changes. 
+ */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; + + ret = zram_decompress_page(zram, page, index); + if (ret) + goto out; + + src = kmap_atomic(bvec->bv_page); + dst = kmap_atomic(page); + memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + + vec.bv_page = page; + vec.bv_len = PAGE_SIZE; + vec.bv_offset = 0; + } + + ret = __zram_bvec_write(zram, &vec, index, offset); out: - if (zstrm) - zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); return ret; } @@ -802,6 +836,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (!is_write) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); + flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); -- cgit v1.2.3 From 81dcaf8152aa471dfe836624af7e2628463e7c23 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:09 +1000 Subject: zram: use zram_slot_lock instead of raw bit_spin_lock op With this cleanup phase, I want to use zram's wrapper function to lock table access which is more consistent with other zram's functions. Link: http://lkml.kernel.org/r/1491196653-7388-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7938f4b98b01..71b0a584bc85 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -415,24 +415,39 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); +} + static bool zram_special_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { struct zram_meta *meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_SAME)) { void *mem; - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); mem = kmap_atomic(page); zram_fill_page(mem + offset, len, meta->table[index].element); kunmap_atomic(mem); return true; } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); return false; } @@ -448,11 +463,11 @@ static bool zram_special_page_write(struct zram *zram, u32 index, kunmap_atomic(mem); /* Free memory associated with this sector now. 
*/ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_SAME); zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); return true; @@ -559,7 +574,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); @@ -578,7 +593,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) @@ -731,11 +746,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, * Free memory associated with this sector * before overwriting unused sectors. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); meta->table[index].handle = handle; zram_set_obj_size(meta, index, comp_len); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Update stats */ atomic64_add(comp_len, &zram->stats.compr_data_size); @@ -793,7 +808,6 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; - struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -814,9 +828,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; @@ -925,9 +939,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); } -- cgit v1.2.3 From cbdd6aae9e56a1683f18de141d9553cb1d1f7c52 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:09 +1000 Subject: zram: remove zram_meta structure It's redundant now. Instead, remove it and use zram structure directly. 
Link: http://lkml.kernel.org/r/1491196653-7388-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 163 +++++++++++++++++------------------------- drivers/block/zram/zram_drv.h | 6 +- 2 files changed, 65 insertions(+), 104 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 71b0a584bc85..fdb73222841d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -58,46 +58,46 @@ static inline struct zram *dev_to_zram(struct device *dev) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram_meta *meta, u32 index, +static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - return meta->table[index].value & BIT(flag); + return zram->table[index].value & BIT(flag); } -static void zram_set_flag(struct zram_meta *meta, u32 index, +static void zram_set_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value |= BIT(flag); + zram->table[index].value |= BIT(flag); } -static void zram_clear_flag(struct zram_meta *meta, u32 index, +static void zram_clear_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value &= ~BIT(flag); + zram->table[index].value &= ~BIT(flag); } -static inline void zram_set_element(struct zram_meta *meta, u32 index, +static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { - meta->table[index].element = element; + zram->table[index].element = element; } -static inline void zram_clear_element(struct zram_meta *meta, u32 index) +static inline void zram_clear_element(struct zram *zram, u32 index) { - meta->table[index].element = 0; + zram->table[index].element = 0; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram_meta *meta, +static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } #if PAGE_SIZE != 4096 @@ -252,9 +252,8 @@ static ssize_t mem_used_max_store(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - struct zram_meta *meta = zram->meta; atomic_long_set(&zram->stats.max_used_pages, - zs_get_total_pages(meta->mem_pool)); + zs_get_total_pages(zram->mem_pool)); } up_read(&zram->init_lock); @@ -327,7 +326,6 @@ static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -335,8 +333,7 @@ static ssize_t compact_store(struct device *dev, return -EINVAL; } - meta = zram->meta; - zs_compact(meta->mem_pool); + zs_compact(zram->mem_pool); up_read(&zram->init_lock); return len; @@ -373,8 +370,8 @@ static ssize_t mm_stat_show(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - mem_used = zs_get_total_pages(zram->meta->mem_pool); - 
zs_pool_stats(zram->meta->mem_pool, &pool_stats); + mem_used = zs_get_total_pages(zram->mem_pool); + zs_pool_stats(zram->mem_pool, &pool_stats); } orig_size = atomic64_read(&zram->stats.pages_stored); @@ -418,32 +415,26 @@ static DEVICE_ATTR_RO(debug_stat); static void zram_slot_lock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); } static void zram_slot_unlock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } static bool zram_special_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { - struct zram_meta *meta = zram->meta; - zram_slot_lock(zram, index); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { + if (unlikely(!zram->table[index].handle) || + zram_test_flag(zram, index, ZRAM_SAME)) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, meta->table[index].element); + zram_fill_page(mem + offset, len, zram->table[index].element); kunmap_atomic(mem); return true; } @@ -459,14 +450,12 @@ static bool zram_special_page_write(struct zram *zram, u32 index, void *mem = kmap_atomic(page); if (page_same_filled(mem, &element)) { - struct zram_meta *meta = zram->meta; - kunmap_atomic(mem); /* Free memory associated with this sector now. */ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, element); zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); @@ -477,56 +466,44 @@ static bool zram_special_page_write(struct zram *zram, u32 index, return false; } -static void zram_meta_free(struct zram_meta *meta, u64 disksize) +static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; size_t index; /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. 
*/ - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) continue; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); } - zs_destroy_pool(meta->mem_pool); - vfree(meta->table); - kfree(meta); + zs_destroy_pool(zram->mem_pool); + vfree(zram->table); } -static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +static bool zram_meta_alloc(struct zram *zram, u64 disksize) { size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - - if (!meta) - return NULL; num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto out_error; - } + zram->table = vzalloc(num_pages * sizeof(*zram->table)); + if (!zram->table) + return false; - meta->mem_pool = zs_create_pool(pool_name); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto out_error; + zram->mem_pool = zs_create_pool(zram->disk->disk_name); + if (!zram->mem_pool) { + vfree(zram->table); + return false; } - return meta; - -out_error: - vfree(meta->table); - kfree(meta); - return NULL; + return true; } /* @@ -536,16 +513,15 @@ out_error: */ static void zram_free_page(struct zram *zram, size_t index) { - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. */ - if (zram_test_flag(meta, index, ZRAM_SAME)) { - zram_clear_flag(meta, index, ZRAM_SAME); - zram_clear_element(meta, index); + if (zram_test_flag(zram, index, ZRAM_SAME)) { + zram_clear_flag(zram, index, ZRAM_SAME); + zram_clear_element(zram, index); atomic64_dec(&zram->stats.same_pages); return; } @@ -553,14 +529,14 @@ static void zram_free_page(struct zram *zram, size_t index) if (!handle) return; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); - atomic64_sub(zram_get_obj_size(meta, index), + atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - meta->table[index].handle = 0; - zram_set_obj_size(meta, index, 0); + zram->table[index].handle = 0; + zram_set_obj_size(zram, index, 0); } static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) @@ -569,16 +545,15 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) unsigned long handle; unsigned int size; void *src, *dst; - struct zram_meta *meta = zram->meta; if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; zram_slot_lock(zram, index); - handle = meta->table[index].handle; - size = zram_get_obj_size(meta, index); + handle = zram->table[index].handle; + size = zram_get_obj_size(zram, index); - src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_atomic(page); copy_page(dst, src); @@ -592,7 +567,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) kunmap_atomic(dst); zcomp_stream_put(zram->comp); } - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. 
*/ @@ -647,7 +622,6 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, unsigned int comp_len; void *src; unsigned long handle = 0; - struct zram_meta *meta = zram->meta; compress_again: src = kmap_atomic(page); @@ -676,7 +650,7 @@ compress_again: * from the slow path and handle has already been allocated. */ if (!handle) - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | @@ -684,7 +658,7 @@ compress_again: if (!handle) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); *zstrm = zcomp_stream_get(zram->comp); @@ -707,7 +681,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, void *src, *dst; struct zcomp_strm *zstrm; unsigned long alloced_pages; - struct zram_meta *meta = zram->meta; struct page *page = bvec->bv_page; if (zram_special_page_write(zram, index, page)) @@ -720,16 +693,16 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, return ret; } - alloced_pages = zs_get_total_pages(meta->mem_pool); + alloced_pages = zs_get_total_pages(zram->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { zcomp_stream_put(zram->comp); - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); return -ENOMEM; } - dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); if (comp_len == PAGE_SIZE) { src = kmap_atomic(page); @@ -740,7 +713,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } zcomp_stream_put(zram->comp); - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); /* * Free memory associated with this sector @@ -748,8 +721,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, */ zram_slot_lock(zram, index); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, comp_len); + zram->table[index].handle = handle; + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ @@ -934,10 +907,8 @@ static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_meta *meta; zram = bdev->bd_disk->private_data; - meta = zram->meta; zram_slot_lock(zram, index); zram_free_page(zram, index); @@ -985,7 +956,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zram_meta *meta; struct zcomp *comp; u64 disksize; @@ -998,7 +968,6 @@ static void zram_reset_device(struct zram *zram) return; } - meta = zram->meta; comp = zram->comp; disksize = zram->disksize; @@ -1011,7 +980,7 @@ static void zram_reset_device(struct zram *zram) up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); zcomp_destroy(comp); } @@ -1020,7 +989,6 @@ static ssize_t disksize_store(struct device *dev, { u64 disksize; struct zcomp *comp; - struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); int err; @@ -1029,8 +997,7 @@ static ssize_t disksize_store(struct device *dev, return -EINVAL; disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->disk_name, disksize); - if (!meta) + if (!zram_meta_alloc(zram, disksize)) return -ENOMEM; comp = zcomp_create(zram->compressor); 
@@ -1048,7 +1015,6 @@ static ssize_t disksize_store(struct device *dev, goto out_destroy_comp; } - zram->meta = meta; zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); @@ -1061,7 +1027,7 @@ out_destroy_comp: up_write(&zram->init_lock); zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); return err; } @@ -1248,7 +1214,6 @@ static int zram_add(void) goto out_free_disk; } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); - zram->meta = NULL; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index caeff51f1571..e34e44d02e3e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -92,13 +92,9 @@ struct zram_stats { atomic64_t writestall; /* no. of write slow paths */ }; -struct zram_meta { +struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; -}; - -struct zram { - struct zram_meta *meta; struct zcomp *comp; struct gendisk *disk; /* Prevent concurrent execution of device init */ -- cgit v1.2.3 From 6d398c02ffb90d4a581a83d9a0a41db9ae5ec2c8 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:10 +1000 Subject: zram: introduce zram data accessor With element, sometime I got confused handle and element access. It might be my bad but I think it's time to introduce accessor to prevent future idiot like me. This patch is just clean-up patch so it shouldn't change any behavior. Link: http://lkml.kernel.org/r/1491196653-7388-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fdb73222841d..c3171e5aa582 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,6 +57,16 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } +static unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + /* flag operations require table entry bit_spin_lock() being held */ static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) @@ -82,9 +92,9 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static inline void zram_clear_element(struct zram *zram, u32 index) +static unsigned long zram_get_element(struct zram *zram, u32 index) { - zram->table[index].element = 0; + return zram->table[index].element; } static size_t zram_get_obj_size(struct zram *zram, u32 index) @@ -428,13 +438,14 @@ static bool zram_special_page_read(struct zram *zram, u32 index, unsigned int offset, unsigned int len) { zram_slot_lock(zram, index); - if (unlikely(!zram->table[index].handle) || - zram_test_flag(zram, index, ZRAM_SAME)) { + if (unlikely(!zram_get_handle(zram, index) || + zram_test_flag(zram, index, ZRAM_SAME))) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, zram->table[index].element); + zram_fill_page(mem + offset, len, + 
zram_get_element(zram, index)); kunmap_atomic(mem); return true; } @@ -473,7 +484,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. * Simply clear same page flag. @@ -513,7 +524,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. @@ -521,7 +532,7 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(zram, index, ZRAM_SAME)) { zram_clear_flag(zram, index, ZRAM_SAME); - zram_clear_element(zram, index); + zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); return; } @@ -535,7 +546,7 @@ static void zram_free_page(struct zram *zram, size_t index) &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - zram->table[index].handle = 0; + zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); } @@ -550,7 +561,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) return 0; zram_slot_lock(zram, index); - handle = zram->table[index].handle; + handle = zram_get_handle(zram, index); size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); @@ -721,7 +732,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, */ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram->table[index].handle = handle; + zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); -- cgit v1.2.3 From f0f450254111c052e14bb721089ec893ced2d3e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:21:11 +1000 Subject: mm/slab.h: add additional consistency check As found in PaX, this adds a cheap check on heap consistency, just to notice if things have gotten corrupted in the page lookup. Given the kinds of heap attacks I've been seeing, I think this added consistency check is worth it given how inexpensive it is. When heap metadata gets corrupted, we can get into nasty side-effects that can be attacker-controlled, so better to catch obviously bad states as early as possible. 
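In short, the check (the one-line hunk follows below) verifies that the page backing a supposed slab object really is a slab page before page->slab_cache is trusted; roughly:

	/* illustrative summary of the added check in cache_from_obj() */
	page = virt_to_head_page(x);
	BUG_ON(!PageSlab(page));	/* corrupted or non-slab pointer */
	cachep = page->slab_cache;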
Link: http://lkml.kernel.org/r/20170331164028.GA118828@beast Signed-off-by: Kees Cook Cc: Michael Ellerman Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/slab.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..64447640b70c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -384,6 +384,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; page = virt_to_head_page(x); + BUG_ON(!PageSlab(page)); cachep = page->slab_cache; if (slab_equal_or_root(cachep, s)) return cachep; -- cgit v1.2.3 From bccc07c8529761ea6e0e8609f40c1b3bb8ddc2e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:21:11 +1000 Subject: oom: improve oom disable handling Tetsuo has reported that sysrq triggered OOM killer will print a misleading information when no tasks are selected: [ 713.805315] sysrq: SysRq : Manual OOM execution [ 713.808920] Out of memory: Kill process 4468 ((agetty)) score 0 or sacrifice child [ 713.814913] Killed process 4468 ((agetty)) total-vm:43704kB, anon-rss:1760kB, file-rss:0kB, shmem-rss:0kB [ 714.004805] sysrq: SysRq : Manual OOM execution [ 714.005936] Out of memory: Kill process 4469 (systemd-cgroups) score 0 or sacrifice child [ 714.008117] Killed process 4469 (systemd-cgroups) total-vm:10704kB, anon-rss:120kB, file-rss:0kB, shmem-rss:0kB [ 714.189310] sysrq: SysRq : Manual OOM execution [ 714.193425] sysrq: OOM request ignored because killer is disabled [ 714.381313] sysrq: SysRq : Manual OOM execution [ 714.385158] sysrq: OOM request ignored because killer is disabled [ 714.573320] sysrq: SysRq : Manual OOM execution [ 714.576988] sysrq: OOM request ignored because killer is disabled The real reason is that there are no eligible tasks for the OOM killer to select but since 7c5f64f84483bd13 ("mm: oom: deduplicate victim selection code for memcg and global oom") the semantic of out_of_memory has changed without updating moom_callback. This patch updates moom_callback to tell that no task was eligible which is the case for both oom killer disabled and no eligible tasks. In order to help distinguish first case from the second add printk to both oom_killer_{enable,disable}. This information is useful on its own because it might help debugging potential memory allocation failures. Fixes: 7c5f64f84483bd13 ("mm: oom: deduplicate victim selection code for memcg and global oom") Link: http://lkml.kernel.org/r/20170404134705.6361-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton --- drivers/tty/sysrq.c | 2 +- mm/oom_kill.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index c6fc7141d7b2..32d61f1810f0 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored) mutex_lock(&oom_lock); if (!out_of_memory(&oc)) - pr_info("OOM request ignored because killer is disabled\n"); + pr_info("OOM request ignored. 
No task eligible\n"); mutex_unlock(&oom_lock); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d083714a2bb9..04c9143a8625 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -685,6 +685,7 @@ void exit_oom_victim(void) void oom_killer_enable(void) { oom_killer_disabled = false; + pr_info("OOM killer enabled.\n"); } /** @@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout) oom_killer_enable(); return false; } + pr_info("OOM killer disabled.\n"); return true; } -- cgit v1.2.3 From 5560b2a6eac9664304bbbc280b007e019c491470 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 5 Apr 2017 09:21:12 +1000 Subject: mm/mmap: replace SHM_HUGE_MASK with MAP_HUGE_MASK inside mmap_pgoff 091d0d55b286 ("shm: fix null pointer deref when userspace specifies invalid hugepage size") had replaced MAP_HUGE_MASK with SHM_HUGE_MASK. Though both of them contain the same numeric value of 0x3f, MAP_HUGE_MASK flag sounds more appropriate than the other one in the context. Hence change it back. Link: http://lkml.kernel.org/r/20170404045635.616-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Reviewed-by: Matthew Wilcox Acked-by: Balbir Singh Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index bfbe8856d134..f82741e199c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct user_struct *user = NULL; struct hstate *hs; - hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; -- cgit v1.2.3 From 4bc7055c7f0f9aee23aeed9001ff5530416fa069 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Apr 2017 09:21:12 +1000 Subject: mm/page_alloc: return 0 in case this node has no page within the zone The whole memory space is divided into several zones and nodes may have no page in some zones. In this case, the __absent_pages_in_range() would return 0, since the range it is searching for is an empty range. Also this happens more often to those nodes with higher memory range when there are more nodes, which is a trend for future architectures. This patch checks the zone range after clamp and adjustment, return 0 if the range is an empty range. Link: http://lkml.kernel.org/r/20170206154314.15705-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e025e620053d..4ec48c83c460 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5783,6 +5783,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* -- cgit v1.2.3 From d53fffc22b530fc4ed15e6f031ced4b54ee8f46c Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:13 +1000 Subject: mm: vmscan: do not pass reclaimed slab to vmpressure During global reclaim, the nr_reclaimed passed to vmpressure includes the pages reclaimed from slab. But the corresponding scanned slab pages is not passed. There is an impact to the vmpressure values because of this. 
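To make the skew concrete: vmpressure derives its level from the scanned:reclaimed ratio, roughly along the lines of (a simplification of mm/vmpressure.c, not the exact code):

    pressure = 100 - (100 * reclaimed / scanned);

Counting slab pages in "reclaimed" while leaving "scanned" untouched therefore lowers the computed pressure, which is why consumers see fewer and later critical events, as described below.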
While moving from kernel version 3.18 to 4.4, a difference is seen in the vmpressure values for the same workload resulting in a different behaviour of the vmpressure consumer. One such case is of a vmpressure based lowmemorykiller. It is observed that the vmpressure events are received late and less in number resulting in tasks not being killed at the right time. In this use case, The number of critical vmpressure events received is around 50% less on 4.4 than 3.18. The following numbers show the impact on reclaim activity due to the change in behaviour of lowmemorykiller on a 4GB device. The test launches a number of apps in sequence and repeats it multiple times. The difference in reclaim behaviour is because of lesser number of kills and kills happening late, resulting in more swapping and page cache reclaim. v4.4 v3.18 pgpgin 163016456 145617236 pgpgout 4366220 4188004 workingset_refault 29857868 26781854 workingset_activate 6293946 5634625 pswpin 1327601 1133912 pswpout 3593842 3229602 pgalloc_dma 99520618 94402970 pgalloc_normal 104046854 98124798 pgfree 203772640 192600737 pgmajfault 2126962 1851836 pgsteal_kswapd_dma 19732899 18039462 pgsteal_kswapd_normal 19945336 17977706 pgsteal_direct_dma 206757 131376 pgsteal_direct_normal 236783 138247 pageoutrun 116622 108370 allocstall 7220 4684 compact_stall 931 856 The lowmemorykiller example above is just for indicating the difference in vmpressure events between 4.4 and 3.18. Do not consider reclaimed slab pages for vmpressure calculation. The reclaimed pages from slab can be excluded because the freeing of a page by slab shrinking depends on each slab's object population, making the cost model (i.e. scan:free) different from that of LRU. Also, not every shrinker accounts the pages it reclaims. Ideally the pages reclaimed from slab should be passed to vmpressure, otherwise higher vmpressure levels can be triggered even when there is a reclaim progress. But accounting only the reclaimed slab pages without the scanned, and adding something which does not fit into the cost model just adds noise to the vmpressure values. Fixes: 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") Link: http://lkml.kernel.org/r/1486641577-11685-2-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Rik van Riel Cc: Vladimir Davydov Cc: Anton Vorontsov Cc: Shiraz Hashim Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a30150b4dee..58615bb27f2f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2550,16 +2550,23 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. 
+ */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; -- cgit v1.2.3 From 013ae23fee7926cffae1e1b840f0466baed4191b Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 5 Apr 2017 09:21:13 +1000 Subject: mm/page_owner: align with pageblock_nr pages When pfn_valid(pfn) returns false, pfn should be aligned with pageblock_nr_pages other than MAX_ORDER_NR_PAGES in init_pages_in_zone, because the skipped 2M may be valid pfn, as a result, early allocated count will not be accurate. Link: http://lkml.kernel.org/r/1468938136-24228-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..754efdd52bf7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From 0fba0909df759f27efbed7f234beff93139ba60e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 5 Apr 2017 09:21:14 +1000 Subject: mm/vmstat.c: walk the zone in pageblock_nr_pages steps when walking the zone, we can happens to the holes. we should not align MAX_ORDER_NR_PAGES, so it can skip the normal memory. In addition, pagetypeinfo_showmixedcount_print reflect fragmentization. we hope to get more accurate data. therefore, I decide to fix it. Link: http://lkml.kernel.org/r/1469502526-24486-2-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 754efdd52bf7..c3cee247f2e6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From b4b49d521d1ee53b4988ea36eaa292a0787e330e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:14 +1000 Subject: kasan: introduce helper functions for determining bug type Patch series "kasan: improve error reports", v2. This patchset improves KASAN reports by making them easier to read and a little more detailed. Also improves mm/kasan/report.c readability. 
Effectively changes a use-after-free report to: ================================================================== BUG: KASAN: use-after-free in kmalloc_uaf+0xaa/0xb6 [test_kasan] Write of size 1 at addr ffff88006aa59da8 by task insmod/3951 CPU: 1 PID: 3951 Comm: insmod Tainted: G B 4.10.0+ #84 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x292/0x398 print_address_description+0x73/0x280 kasan_report.part.2+0x207/0x2f0 __asan_report_store1_noabort+0x2c/0x30 kmalloc_uaf+0xaa/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x7f22cfd0b9da RSP: 002b:00007ffe69118a78 EFLAGS: 00000206 ORIG_RAX: 00000000000000af RAX: ffffffffffffffda RBX: 0000555671242090 RCX: 00007f22cfd0b9da RDX: 00007f22cffcaf88 RSI: 000000000004df7e RDI: 00007f22d0399000 RBP: 00007f22cffcaf88 R08: 0000000000000003 R09: 0000000000000000 R10: 00007f22cfd07d0a R11: 0000000000000206 R12: 0000555671243190 R13: 000000000001fe81 R14: 0000000000000000 R15: 0000000000000004 Allocated by task 3951: save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_kmalloc+0xad/0xe0 kmem_cache_alloc_trace+0x82/0x270 kmalloc_uaf+0x56/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Freed by task 3951: save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_slab_free+0x72/0xc0 kfree+0xe8/0x2b0 kmalloc_uaf+0x85/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc The buggy address belongs to the object at ffff88006aa59da0 which belongs to the cache kmalloc-16 of size 16 The buggy address is located 8 bytes inside of 16-byte region [ffff88006aa59da0, ffff88006aa59db0) The buggy address belongs to the page: page:ffffea0001aa9640 count:1 mapcount:0 mapping: (null) index:0x0 flags: 0x100000000000100(slab) raw: 0100000000000100 0000000000000000 0000000000000000 0000000180800080 raw: ffffea0001abe380 0000000700000007 ffff88006c401b40 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88006aa59c80: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc ffff88006aa59d00: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc >ffff88006aa59d80: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ^ ffff88006aa59e00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ffff88006aa59e80: fb fb fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc ================================================================== from: ================================================================== BUG: KASAN: use-after-free in kmalloc_uaf+0xaa/0xb6 [test_kasan] at addr ffff88006c4dcb28 Write of size 1 by task insmod/3984 CPU: 1 PID: 3984 Comm: insmod Tainted: G B 4.10.0+ #83 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x292/0x398 kasan_object_err+0x1c/0x70 kasan_report.part.1+0x20e/0x4e0 __asan_report_store1_noabort+0x2c/0x30 kmalloc_uaf+0xaa/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 
SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x7feca0f779da RSP: 002b:00007ffdfeae5218 EFLAGS: 00000206 ORIG_RAX: 00000000000000af RAX: ffffffffffffffda RBX: 000055a064c13090 RCX: 00007feca0f779da RDX: 00007feca1236f88 RSI: 000000000004df7e RDI: 00007feca1605000 RBP: 00007feca1236f88 R08: 0000000000000003 R09: 0000000000000000 R10: 00007feca0f73d0a R11: 0000000000000206 R12: 000055a064c14190 R13: 000000000001fe81 R14: 0000000000000000 R15: 0000000000000004 Object at ffff88006c4dcb20, in cache kmalloc-16 size: 16 Allocated: PID = 3984 save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_kmalloc+0xad/0xe0 kmem_cache_alloc_trace+0x82/0x270 kmalloc_uaf+0x56/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Freed: PID = 3984 save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_slab_free+0x73/0xc0 kfree+0xe8/0x2b0 kmalloc_uaf+0x85/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Memory state around the buggy address: ffff88006c4dca00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ffff88006c4dca80: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc >ffff88006c4dcb00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ^ ffff88006c4dcb80: fb fb fc fc 00 00 fc fc fb fb fc fc fb fb fc fc ffff88006c4dcc00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ================================================================== This patch (of 9): Introduce get_shadow_bug_type() function, which determines bug type based on the shadow value for a particular kernel address. Introduce get_wild_bug_type() function, which determines bug type for addresses which don't have a corresponding shadow value. 
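As a rough sketch of the shadow-based classification (not the exact switch; the KASAN_* shadow markers are defined in mm/kasan/kasan.h):

    u8 shadow = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);

    switch (shadow) {
    case KASAN_KMALLOC_FREE:      /* the 0xfb bytes in the dumps above */
        bug_type = "use-after-free";
        break;
    case KASAN_KMALLOC_REDZONE:   /* 0xfc */
        bug_type = "slab-out-of-bounds";
        break;
    /* stack/global redzones map to the matching *-out-of-bounds strings */
    }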
Link: http://lkml.kernel.org/r/20170302134851.101218-2-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a0803f16..c2bc08b1b5e0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -98,6 +104,27 @@ static void print_error_description(struct kasan_access_info *info) break; } + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_shadow_bug_type(info); + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); @@ -267,18 +294,11 @@ static void print_shadow_for_address(const void *addr) static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; kasan_start_report(&flags); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; + if (!addr_has_shadow(info)) { + const char *bug_type = get_wild_bug_type(info); pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", -- cgit v1.2.3 From cd971c1a06d1c77a03873e8c7670bf6a9ab9c639 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:15 +1000 Subject: kasan: unify report headers Unify KASAN report header format for different kinds of bad memory accesses. Makes the code simpler. 
Link: http://lkml.kernel.org/r/20170302134851.101218-3-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c2bc08b1b5e0..d6b6ec77c56a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -121,16 +121,22 @@ const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = get_shadow_bug_type(info); + const char *bug_type = get_bug_type(info); pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); + bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + info->is_write ? "Write" : "Read", info->access_size, + current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -297,17 +303,11 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_start_report(&flags); + print_error_description(info); + if (!addr_has_shadow(info)) { - const char *bug_type = get_wild_bug_type(info); - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); dump_stack(); } else { - print_error_description(info); print_address_description(info); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 20e5c73f3be18668199219e9685c4aa6b1eac669 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:15 +1000 Subject: kasan: change allocation and freeing stack traces headers Change stack traces headers from: Allocated: PID = 42 to: Allocated by task 42: Makes the report one line shorter and look better. 
Link: http://lkml.kernel.org/r/20170302134851.101218-4-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d6b6ec77c56a..7d24363edd66 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -177,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -201,10 +201,8 @@ static void kasan_object_err(struct kmem_cache *cache, void *object) if (!(cache->flags & SLAB_KASAN)) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + print_track(&alloc_info->alloc_track, "Allocated"); + print_track(&alloc_info->free_track, "Freed"); } void kasan_report_double_free(struct kmem_cache *cache, void *object, -- cgit v1.2.3 From 20af2c1cc0bb35b70578ed9d8f209ac8a5df6d5d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:16 +1000 Subject: kasan: simplify address description logic Simplify logic for describing a memory address. Add addr_to_page() helper function. Makes the code easier to follow. Link: http://lkml.kernel.org/r/20170302134851.101218-5-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7d24363edd66..a82d6896062b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -190,11 +190,18 @@ static void print_track(struct kasan_track *track, const char *prefix) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) +{ + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} + +static void describe_object(struct kmem_cache *cache, void *object) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - dump_stack(); pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, cache->object_size); @@ -213,34 +220,32 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, kasan_start_report(&flags); pr_err("BUG: Double free or freeing an invalid pointer\n"); pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); + dump_stack(); + describe_object(cache, object); kasan_end_report(&flags); } static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; + struct page *page = addr_to_page(addr); - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { - struct page *page = virt_to_head_page(addr); - - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - object = nearest_obj(cache, page, - (void *)info->access_addr); - kasan_object_err(cache, object); - return; - } + if (page) dump_page(page, "kasan: bad access detected"); + + dump_stack(); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + 
void *object = nearest_obj(cache, page, (void *)addr); + + describe_object(cache, object); } if (kernel_or_module_addr(addr)) { if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } static bool row_is_guilty(const void *row, const void *guilty) -- cgit v1.2.3 From 90ca2984ef36bf76ebb10b004677cc555bb29200 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:16 +1000 Subject: kasan: change report header Change report header format from: BUG: KASAN: use-after-free in unwind_get_return_address+0x28a/0x2c0 at addr ffff880069437950 Read of size 8 by task insmod/3925 to: BUG: KASAN: use-after-free in unwind_get_return_address+0x28a/0x2c0 Read of size 8 at addr ffff880069437950 by task insmod/3925 The exact access address is not usually important, so move it to the second line. This also makes the header look visually balanced. Link: http://lkml.kernel.org/r/20170302134851.101218-6-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index a82d6896062b..8efc69473a37 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -132,11 +132,11 @@ static void print_error_description(struct kasan_access_info *info) { const char *bug_type = get_bug_type(info); - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, - current->comm, task_pid_nr(current)); + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) -- cgit v1.2.3 From 493f185ab63909658f4c500ad0ee652e0abece4e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:17 +1000 Subject: kasan: improve slab object description Changes slab object description from: Object at ffff880068388540, in cache kmalloc-128 size: 128 to: The buggy address belongs to the object at ffff880068388540 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 123 bytes inside of 128-byte region [ffff880068388540, ffff8800683885c0) Makes it more explanatory and adds information about relative offset of the accessed address to the start of the object. 
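A quick illustration with made-up addresses, for a 128-byte object at ffff880068388540 (region [ffff880068388540, ffff8800683885c0)):

    access at ffff880068388530  ->  "16 bytes to the left of"
    access at ffff8800683885bb  ->  "123 bytes inside of"      (the case quoted above)
    access at ffff8800683885c4  ->  "4 bytes to the right of"

one line per branch of describe_object_addr() in the hunk below.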
Link: http://lkml.kernel.org/r/20170302134851.101218-7-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8efc69473a37..a1b1ef7a19f5 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -198,18 +198,49 @@ static struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object(struct kmem_cache *cache, void *object) +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); - if (!(cache->flags & SLAB_KASAN)) + if (!addr) return; - print_track(&alloc_info->alloc_track, "Allocated"); - print_track(&alloc_info->free_track, "Freed"); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); +} + +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + print_track(&alloc_info->free_track, "Freed"); + } + + describe_object_addr(cache, object, addr); } void kasan_report_double_free(struct kmem_cache *cache, void *object, @@ -221,13 +252,13 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, pr_err("BUG: Double free or freeing an invalid pointer\n"); pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); dump_stack(); - describe_object(cache, object); + describe_object(cache, object, NULL); kasan_end_report(&flags); } static void print_address_description(struct kasan_access_info *info) { - const void *addr = info->access_addr; + void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); if (page) @@ -237,9 +268,9 @@ static void print_address_description(struct kasan_access_info *info) if (page && PageSlab(page)) { struct kmem_cache *cache = page->slab_cache; - void *object = nearest_obj(cache, page, (void *)addr); + void *object = nearest_obj(cache, page, addr); - describe_object(cache, object); + describe_object(cache, object, addr); } if (kernel_or_module_addr(addr)) { -- cgit v1.2.3 From 8927e95194a69c91b9a7c71b6f83a2fbcd81573f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:17 +1000 Subject: kasan: print page description after stacks Moves page description after the stacks since it's less important. 
Link: http://lkml.kernel.org/r/20170302134851.101218-8-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index a1b1ef7a19f5..b015acc80876 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -261,9 +261,6 @@ static void print_address_description(struct kasan_access_info *info) void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); - if (page) - dump_page(page, "kasan: bad access detected"); - dump_stack(); if (page && PageSlab(page)) { @@ -273,9 +270,14 @@ static void print_address_description(struct kasan_access_info *info) describe_object(cache, object, addr); } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); } } -- cgit v1.2.3 From ae8d01251c6ccb361a3cf3ae881f360e5e48fa54 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:18 +1000 Subject: kasan: improve double-free report format Changes double-free report header from BUG: Double free or freeing an invalid pointer Unexpected shadow byte: 0xFB to BUG: KASAN: double-free or invalid-free in kmalloc_oob_left+0xe5/0xef This makes a bug uniquely identifiable by the first report line. To account for removing of the unexpected shadow value, print shadow bytes at the end of the report as in reports for other kinds of bugs. 
Link: http://lkml.kernel.org/r/20170302134851.101218-9-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/kasan.c | 3 ++- mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 30 ++++++++++++++---------------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..9348d27088c1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8eb077..1229298cce64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b015acc80876..7d3d9670e233 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -243,22 +243,8 @@ static void describe_object(struct kmem_cache *cache, void *object, describe_object_addr(cache, object, addr); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) -{ - unsigned long flags; - - kasan_start_report(&flags); - pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - dump_stack(); - describe_object(cache, object, NULL); - kasan_end_report(&flags); -} - -static void print_address_description(struct kasan_access_info *info) +static void print_address_description(void *addr) { - void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); dump_stack(); @@ -333,6 +319,18 @@ static void print_shadow_for_address(const void *addr) } } +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + print_address_description(object); + print_shadow_for_address(object); + kasan_end_report(&flags); +} + static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; @@ -344,7 +342,7 @@ static void kasan_report_error(struct kasan_access_info *info) if (!addr_has_shadow(info)) { dump_stack(); } else { - print_address_description(info); + print_address_description((void *)info->access_addr); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 60a165975d8fb595641ce715522b071e6de0b1e6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:18 +1000 Subject: kasan: separate report parts by empty lines Makes the report easier to read. 
Link: http://lkml.kernel.org/r/20170302134851.101218-10-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7d3d9670e233..beee0e980e2d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -237,7 +237,9 @@ static void describe_object(struct kmem_cache *cache, void *object, if (cache->flags & SLAB_KASAN) { print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); } describe_object_addr(cache, object, addr); @@ -248,6 +250,7 @@ static void print_address_description(void *addr) struct page *page = addr_to_page(addr); dump_stack(); + pr_err("\n"); if (page && PageSlab(page)) { struct kmem_cache *cache = page->slab_cache; @@ -326,7 +329,9 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, kasan_start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); print_address_description(object); + pr_err("\n"); print_shadow_for_address(object); kasan_end_report(&flags); } @@ -338,11 +343,13 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_start_report(&flags); print_error_description(info); + pr_err("\n"); if (!addr_has_shadow(info)) { dump_stack(); } else { print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 4e9ebb5190ba6df165cfec2dd6f71e00739ec5b0 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 5 Apr 2017 09:21:19 +1000 Subject: fs/proc/inode.c: remove cast from memory allocation Coccinelle emits WARNING: casting value returned by memory allocation function to (struct proc_inode *) is useless. Remove unnecessary cast. Link: http://lkml.kernel.org/r/1487745720-16967-1-git-send-email-me@tobin.cc Signed-off-by: Tobin C. Harding Signed-off-by: Andrew Morton --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; -- cgit v1.2.3 From 44f507272bf7630f6f5dfc029c848e9afb0e5c5a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 5 Apr 2017 09:21:20 +1000 Subject: proc/sysctl: fix the int overflow for jiffies conversion do_proc_dointvec_jiffies_conv() uses LONG_MAX/HZ as the max value to avoid overflow. But actually the *valp is int type, so it still causes overflow. For example, echo 2147483647 > ./sys/net/ipv4/tcp_keepalive_time Then, cat ./sys/net/ipv4/tcp_keepalive_time The output is "-1", it is not expected. Now use INT_MAX/HZ as the max value instead LONG_MAX/HZ to fix it. 
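A worked example, taking HZ=1000 purely for illustration: *valp is an int, so the old LONG_MAX/HZ bound only protects the unsigned long intermediate, not the stored value:

    write 2147483647:  passes the old check (LONG_MAX/HZ is far larger on 64-bit)
    stored:            2147483647 * 1000 truncated to 32 bits wraps to -1000
    read back:         -1000 is converted back via HZ and printed as the "-1" above

Checking against INT_MAX/HZ (about 2147483 here) rejects such a write up front instead of letting it wrap.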
Link: http://lkml.kernel.org/r/1490109532-9228-1-git-send-email-fgao@ikuai8.com Signed-off-by: Gao Feng Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Eric Dumazet Cc: Josh Poimboeuf Signed-off-by: Andrew Morton --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acf0a5a06da7..60474dfa45c9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2571,7 +2571,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > LONG_MAX / HZ) + if (*lvalp > INT_MAX / HZ) return 1; *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); } else { -- cgit v1.2.3 From cbf9d4280bc59f242e0266258da11685380558cf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 5 Apr 2017 09:21:20 +1000 Subject: drivers/virt/fsl_hypervisor.c: use get_user_pages_unlocked() Moving from get_user_pages() to get_user_pages_unlocked() simplifies the code and takes advantage of VM_FAULT_RETRY functionality when faulting in pages. Link: http://lkml.kernel.org/r/20161101194332.23961-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Michal Hocko Cc: Paolo Bonzini Cc: Kumar Gala Cc: Mihai Caraman Cc: Greg KH Signed-off-by: Andrew Morton --- drivers/virt/fsl_hypervisor.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 150ce2abf6c8..d3eca879a0a8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); /* Get the physical addresses of the source buffer */ - down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? 0 : FOLL_WRITE, - pages, NULL); - up_read(¤t->mm->mmap_sem); + num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset, + num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE); if (num_pinned != num_pages) { /* get_user_pages() failed */ -- cgit v1.2.3 From 5e2b943025143912ffc0cf30dd2c04c25733e014 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 09:21:21 +1000 Subject: jiffies.h: declare jiffies and jiffies_64 with ____cacheline_aligned_in_smp jiffies_64 is defined in kernel/time/timer.c with ____cacheline_aligned_in_smp, however this macro is not part of the declaration of jiffies and jiffies_64 in jiffies.h. As a result clang generates the following warning: kernel/time/timer.c:57:26: error: section does not match previous declaration [-Werror,-Wsection] __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; ^ include/linux/cache.h:39:36: note: expanded from macro '__cacheline_aligned_in_smp' ^ include/linux/cache.h:34:4: note: expanded from macro '__cacheline_aligned' __section__(".data..cacheline_aligned"))) ^ include/linux/jiffies.h:77:12: note: previous attribute is here extern u64 __jiffy_data jiffies_64; ^ include/linux/jiffies.h:70:38: note: expanded from macro '__jiffy_data' Link: http://lkml.kernel.org/r/20170403190200.70273-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: "Jason A . 
Donenfeld" Cc: Grant Grundler Cc: Michael Davidson Cc: Greg Hackmann Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/jiffies.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 624215cebee5..36872fbb815d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -1,6 +1,7 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include #include #include #include @@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* some arch's have a small-data section that can be accessed register-relative - * but that can only take up to, say, 4-byte variables. jiffies being part of - * an 8-byte variable may not be correctly accessed unless we force the issue - */ -#define __jiffy_data __attribute__((section(".data"))) - /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ -extern u64 __jiffy_data jiffies_64; -extern unsigned long volatile __jiffy_data jiffies; +extern u64 __cacheline_aligned_in_smp jiffies_64; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From d1b6ad546626d39f914fb4777fdcbed07a616932 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 5 Apr 2017 09:21:21 +1000 Subject: kernel/hung_task.c: defer showing held locks When I was running my testcase which may block hundreds of threads on fs locks, I got lockup due to output from debug_show_all_locks() added by commit b2d4c2edb2e4f89a ("locking/hung_task: Show all locks"). For example, if 1000 threads were blocked in TASK_UNINTERRUPTIBLE state and 500 out of 1000 threads hold some lock, debug_show_all_locks() from for_each_process_thread() loop will report locks held by 500 threads for 1000 times. This is a too much noise. In order to make sure rcu_lock_break() is called frequently, we should avoid calling debug_show_all_locks() from for_each_process_thread() loop because debug_show_all_locks() effectively calls for_each_process_thread() loop. Let's defer calling debug_show_all_locks() till before panic() or leaving for_each_process_thread() loop. 
Link: http://lkml.kernel.org/r/1489296834-60436-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reviewed-by: Vegard Nossum Cc: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496f..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) @@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, -- cgit v1.2.3 From cc70daf1fe328db2d728652f44266766f54669e8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Apr 2017 09:21:22 +1000 Subject: drivers/misc/vmw_vmci/vmci_queue_pair.c: fix a couple integer overflow tests The "DIV_ROUND_UP(size, PAGE_SIZE)" operation can overflow if "size" is more than ULLONG_MAX - PAGE_SIZE. 
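The overflow is in the rounding macro itself: DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), so for a u64 size within PAGE_SIZE of the maximum,

    size + PAGE_SIZE - 1    /* wraps around to a small value */

and num_pages ends up tiny, sailing past the existing SIZE_MAX checks further down. Rejecting size > SIZE_MAX - PAGE_SIZE before the division keeps num_pages meaningful.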
Link: http://lkml.kernel.org/r/20170322111950.GA11279@mwanda Signed-off-by: Dan Carpenter Cc: Jorgen Hansen Cc: Masahiro Yamada Cc: Michal Hocko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/misc/vmw_vmci/vmci_queue_pair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index 498c0854305f..06c4974ee8dd 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -298,8 +298,11 @@ static void *qp_alloc_queue(u64 size, u32 flags) size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + @@ -624,9 +627,12 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; -- cgit v1.2.3 From cf3e4b6661253d8670899cba733b0a94b41df943 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:23 +1000 Subject: Revert "lib/test_sort.c: make it explicitly non-modular" Patch series "lib: add module support to sort tests". This patch series allows to compile the array-based and linked list sort test code either to loadable modules, or builtin into the kernel. It's very valuable to have modular tests, so you can run them just by insmodding the test modules, instead of needing a separate kernel that runs them at boot. This patch (of 3): This reverts commit 8893f519330bb073a49c5b4676fce4be6f1be15d. It's very valuable to have modular tests, so you can run them just by insmodding the test modules, instead of needing a separate kernel that runs them at boot. Link: http://lkml.kernel.org/r/1488287219-15832-2-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/test_sort.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/test_sort.c b/lib/test_sort.c index 4db3911db50a..d389c1cc2f6c 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,11 +1,8 @@ #include #include -#include +#include -/* - * A simple boot-time regression test - * License: GPL - */ +/* a simple boot-time regression test */ #define TEST_LEN 1000 @@ -41,4 +38,6 @@ exit: kfree(a); return err; } -subsys_initcall(test_sort_init); + +module_init(test_sort_init); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d07dca671be9499dc32308f7b2f19666c60f480e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:23 +1000 Subject: lib: add module support to array-based sort tests Allow to compile the array-based sort test code either to a loadable module, or builtin into the kernel. 
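With the option made tristate the self-test can also be run on demand, e.g. (module name follows from the existing obj-$(CONFIG_TEST_SORT) += test_sort.o rule in lib/Makefile):

    # build with CONFIG_TEST_SORT=m, then
    modprobe test_sort

and since test_sort_init() returns its error code, a failing self-test shows up directly as a failed module load.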
Link: http://lkml.kernel.org/r/1488287219-15832-3-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 97d62c2da6c2..22dd8721d7c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1727,10 +1727,11 @@ config TEST_LIST_SORT If unsure, say N. config TEST_SORT - bool "Array-based sort test" - depends on DEBUG_KERNEL + tristate "Array-based sort test" + depends on DEBUG_KERNEL || m help - This option enables the self-test function of 'sort()' at boot. + This option enables the self-test function of 'sort()' at boot, + or at module load time. If unsure, say N. -- cgit v1.2.3 From a496bb33d48780a9bb8fc35e6174ef29ed83d33a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:24 +1000 Subject: lib: add module support to linked list sorting tests Extract the linked list sorting test code into its own source file, to allow to compile it either to a loadable module, or builtin into the kernel. Link: http://lkml.kernel.org/r/1488287219-15832-4-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 7 +-- lib/Makefile | 1 + lib/list_sort.c | 149 -------------------------------------------------- lib/test_list_sort.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 152 deletions(-) create mode 100644 lib/test_list_sort.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 22dd8721d7c2..924f210db65f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1718,11 +1718,12 @@ config LKDTM Documentation/fault-injection/provoke-crashes.txt config TEST_LIST_SORT - bool "Linked list sorting test" - depends on DEBUG_KERNEL + tristate "Linked list sorting test" + depends on DEBUG_KERNEL || m help Enable this to turn on 'list_sort()' function test. This test is - executed only once during system boot, so affects only boot time. + executed only once during system boot (so affects only boot time), + or at module load time. If unsure, say N. diff --git a/lib/Makefile b/lib/Makefile index 320ac46a8725..786c4538a91f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/list_sort.c b/lib/list_sort.c index 3fe401067e20..9e9acc37652f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,6 +1,3 @@ - -#define pr_fmt(fmt) "list_sort_test: " fmt - #include #include #include @@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head, merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } EXPORT_SYMBOL(list_sort); - -#ifdef CONFIG_TEST_LIST_SORT - -#include -#include - -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). 
- */ -#define TEST_LIST_LEN (512+128+2) /* not including head */ - -#define TEST_POISON1 0xDEADBEEF -#define TEST_POISON2 0xA324354C - -struct debug_el { - unsigned int poison1; - struct list_head list; - unsigned int poison2; - int value; - unsigned serial; -}; - -/* Array, containing pointers to all elements in the test list */ -static struct debug_el **elts __initdata; - -static int __init check(struct debug_el *ela, struct debug_el *elb) -{ - if (ela->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", ela->serial); - return -EINVAL; - } - if (elb->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", elb->serial); - return -EINVAL; - } - if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - pr_err("error: phantom element\n"); - return -EINVAL; - } - if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); - return -EINVAL; - } - if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); - return -EINVAL; - } - return 0; -} - -static int __init cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct debug_el *ela, *elb; - - ela = container_of(a, struct debug_el, list); - elb = container_of(b, struct debug_el, list); - - check(ela, elb); - return ela->value - elb->value; -} - -static int __init list_sort_test(void) -{ - int i, count = 1, err = -ENOMEM; - struct debug_el *el; - struct list_head *cur; - LIST_HEAD(head); - - pr_debug("start testing list_sort()\n"); - - elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); - return err; - } - - for (i = 0; i < TEST_LIST_LEN; i++) { - el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); - goto exit; - } - /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); - el->serial = i; - el->poison1 = TEST_POISON1; - el->poison2 = TEST_POISON2; - elts[i] = el; - list_add_tail(&el->list, &head); - } - - list_sort(NULL, &head, cmp); - - err = -EINVAL; - for (cur = head.next; cur->next != &head; cur = cur->next) { - struct debug_el *el1; - int cmp_result; - - if (cur->next->prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - cmp_result = cmp(NULL, cur, cur->next); - if (cmp_result > 0) { - pr_err("error: list is not sorted\n"); - goto exit; - } - - el = container_of(cur, struct debug_el, list); - el1 = container_of(cur->next, struct debug_el, list); - if (cmp_result == 0 && el->serial >= el1->serial) { - pr_err("error: order of equivalent elements not " - "preserved\n"); - goto exit; - } - - if (check(el, el1)) { - pr_err("error: element check failed\n"); - goto exit; - } - count++; - } - if (head.prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - - if (count != TEST_LIST_LEN) { - pr_err("error: bad list length %d", count); - goto exit; - } - - err = 0; -exit: - for (i = 0; i < TEST_LIST_LEN; i++) - kfree(elts[i]); - kfree(elts); - return err; -} -late_initcall(list_sort_test); -#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c new file mode 100644 index 000000000000..28e817387b04 --- /dev/null +++ b/lib/test_list_sort.c @@ -0,0 +1,150 @@ +#define pr_fmt(fmt) "list_sort_test: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* + * The pattern of set bits in the list length determines 
which cases + * are hit in list_sort(). + */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + pr_err("error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + +static int __init list_sort_test(void) +{ + int i, count = 1, err = -ENOMEM; + struct debug_el *el; + struct list_head *cur; + LIST_HEAD(head); + + pr_debug("start testing list_sort()\n"); + + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); + if (!elts) { + pr_err("error: cannot allocate memory\n"); + return err; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + pr_err("error: cannot allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + err = -EINVAL; + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + pr_err("error: list is not sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + pr_err("error: order of equivalent elements not " + "preserved\n"); + goto exit; + } + + if (check(el, el1)) { + pr_err("error: element check failed\n"); + goto exit; + } + count++; + } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + + if (count != TEST_LIST_LEN) { + pr_err("error: bad list length %d", count); + goto exit; + } + + err = 0; +exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); + kfree(elts); + return err; +} +module_init(list_sort_test); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 004d4db8186a65fdb9035c56487ad716fb3cab28 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 5 Apr 2017 09:21:24 +1000 Subject: firmware/Makefile: force recompilation if makefile changes If you modify the target asm we currently do not force the recompilation of the firmware files. 
The target asm is in the firmware/Makefile, peg this file as a dependency to require re-compilation of firmware targets when the asm changes. Link: http://lkml.kernel.org/r/20170123150727.4883-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Masahiro Yamada Cc: Michal Marek Cc: Ming Lei Cc: Greg Kroah-Hartman Cc: Tom Gundersen Cc: David Woodhouse Signed-off-by: Andrew Morton --- firmware/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/firmware/Makefile b/firmware/Makefile index e297e1b52636..fa3e81c2a97b 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@ wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ include/config/ppc32.h include/config/ppc64.h \ include/config/superh32.h include/config/superh64.h \ - include/config/x86_32.h include/config/x86_64.h) + include/config/x86_32.h include/config/x86_64.h \ + firmware/Makefile) $(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) -- cgit v1.2.3 From 968fc5a74122ccb4805e158f00041c310b00c4d9 Mon Sep 17 00:00:00 2001 From: Ruslan Bilovol Date: Wed, 5 Apr 2017 09:21:25 +1000 Subject: checkpatch: remove obsolete CONFIG_EXPERIMENTAL checks Config EXPERIMENTAL has been removed from kernel in 2013 (see 3d374d0: "final removal of CONFIG_EXPERIMENTAL"), there is no any reason to do these checks now. Link: http://lkml.kernel.org/r/1488234097-20119-1-git-send-email-ruslan.bilovol@gmail.com Signed-off-by: Ruslan Bilovol Acked-by: Kees Cook Acked-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index baa3c7be04ad..30eeba4f1602 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2757,13 +2757,6 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. - if ($realfile =~ /Kconfig/ && - $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # discourage the use of boolean for type definition attributes of Kconfig options if ($realfile =~ /Kconfig/ && $line =~ /^\+\s*\bboolean\b/) { @@ -3157,12 +3150,6 @@ sub process { } } -# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). - if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", -- cgit v1.2.3 From 76a02a9d1023bdc9a5e6cc40549f9a3b2bb807ac Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:25 +1000 Subject: checkpatch: add ability to find bad uses of vsprintf %p extensions %pK was at least once misused at %pk in an out-of-tree module. This lead to some security concerns. Add the ability to track single and multiple line statements for misuses of %p. 
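As an illustration (hypothetical code, not taken from any in-tree user), the new
VSPRINTF_POINTER_EXTENSION test would warn on the lower-case variant while leaving
the documented %pK extension alone:

	pr_info("key at %pk\n", ptr);	/* flagged: 'k' is not a valid %p extension */
	pr_info("key at %pK\n", ptr);	/* ok: %pK output is restricted via kptr_restrict */
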
Link: http://lkml.kernel.org/r/163a690510e636a23187c0dc9caa09ddac6d4cde.1488228427.git.joe@perches.com Signed-off-by: Joe Perches Acked-by: Kees Cook Acked-by: William Roberts Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 30eeba4f1602..732bb3e2fe9a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5663,6 +5663,32 @@ sub process { } } + # check for vsprintf extension %p misuses + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && + $1 !~ /^_*volatile_*$/) { + my $bad_extension = ""; + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + for (my $count = $linenr; $count <= $lc; $count++) { + my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); + $fmt =~ s/%%//g; + if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) { + $bad_extension = $1; + last; + } + } + if ($bad_extension ne "") { + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + WARN("VSPRINTF_POINTER_EXTENSION", + "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n"); + } + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && -- cgit v1.2.3 From f58283acea78e2ce95a3dc9b6bdc39160cfe4dd8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:26 +1000 Subject: checkpatch-add-ability-to-find-bad-uses-of-vsprintf-%pfoo-extensions-fix add helpful comment into lib/vsprintf.c Cc: Joe Perches Cc: Kees Cook Cc: William Roberts Signed-off-by: Andrew Morton --- lib/vsprintf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e3bf4e0f10b5..df113aa9ae5e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * + * Please update scripts/checkpatch.pl when adding new conversion characters. + * (search for "check for vsprintf extension"). + * * Right now we handle: * * - 'F' For symbolic function descriptor pointers with offset -- cgit v1.2.3 From f5c1271b9d5bddc50ccc3a4b1a186501a6d3fecd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:27 +1000 Subject: checkpatch-add-ability-to-find-bad-uses-of-vsprintf-%pfoo-extensions-fix-fix text tweak Cc: Joe Perches Cc: Kees Cook Cc: William Roberts Signed-off-by: Andrew Morton --- lib/vsprintf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index df113aa9ae5e..db44e6049028 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,8 +1477,8 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * - * Please update scripts/checkpatch.pl when adding new conversion characters. - * (search for "check for vsprintf extension"). + * Please update scripts/checkpatch.pl when adding/removing conversion + * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * -- cgit v1.2.3 From c8fd864cec92975be639d388837fff5dcc4e727f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:27 +1000 Subject: checkpatch: improve EMBEDDED_FUNCTION_NAME test The existing behavior relies on patch context to identify function declarations. 
Add the ability to find function declarations when there is an open brace in column 1. This finds function declarations only in specific single line forms where the function name is on a single line like: int foo(args...) { and int foo(args...) { It does not recognize function declarations like: int foo(int bar, int baz) { Link: http://lkml.kernel.org/r/738d74bbbe1a06b80f11ed504818107c68903095.1488155636.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 732bb3e2fe9a..832e8150dba3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3126,6 +3126,17 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check if this appears to be the start function declaration, save the name + if ($sline =~ /^\+\{\s*$/ && + $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { + $context_function = $1; + } + +# check if this appears to be the end of function declaration + if ($sline =~ /^\+\}\s*$/) { + undef $context_function; + } + # check indentation of any line with a bare else # (but not if it is a multiple line "if (foo) return bar; else return baz;") # if the previous line is a break or return and is indented 1 tab more... -- cgit v1.2.3 From 3a44cbfea104dd310da61346c85f0e913f7c555b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:28 +1000 Subject: checkpatch: allow space leading blank lines in email headers Allow a leading space and otherwise blank link in the email headers as it can be a line wrapped Spamassassin multiple line string or any other valid rfc 2822/5322 email header. The line with space causes checkpatch to erroniously think that it's in the content body, as opposed to headers and thus flag a mail header as an unwrapped long comment line. Link: http://lkml.kernel.org/r/d75a9f0b78b3488078429f4037d9fff3bdfa3b78.1490247180.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Darren Hart (VMware) Tested-by: Darren Hart (VMware) Reviewed-by: Darren Hart (VMware) Original-patch-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 832e8150dba3..089c974aa3a5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2628,8 +2628,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - !($rawline =~ /^\s+\S/ || - $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { + !($rawline =~ /^\s+(?:\S|$)/ || + $rawline =~ /^(?:commit\b|from\b|[\w-]+:)/i)) { $in_header_lines = 0; $in_commit_log = 1; $has_commit_log = 1; -- cgit v1.2.3 From 315ed516e9cade03627f7ae8f03d60e0edf19c18 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:21:29 +1000 Subject: reiserfs: use designated initializers Prepare to mark sensitive kernel structures for randomization by making sure they're using designated initializers. These were identified during allyesconfig builds of x86, arm, and arm64, with most initializer fixes extracted from grsecurity. 
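The idea, sketched here on a made-up structure rather than the reiserfs one (the
real conversion is in the diff below): positional initializers bind values purely
by member order, so any reordering or layout randomization silently breaks them,
whereas designated initializers name each member explicitly:

	struct ops {
		int  (*open)(void);
		void (*close)(void);
	};

	int  my_open(void);
	void my_close(void);

	/* positional: pairs values with members by order only */
	static struct ops fragile_ops = { my_open, my_close };

	/* designated: still correct if the members are reordered */
	static struct ops robust_ops = {
		.open	= my_open,
		.close	= my_close,
	};
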
Link: http://lkml.kernel.org/r/20170329210419.GA40066@beast Signed-off-by: Kees Cook Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/item_ops.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd73906..e3c558d1b78c 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) -- cgit v1.2.3 From bf4f265e6677e23afe15a2e4cd2d4a9db9144bab Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 5 Apr 2017 09:21:30 +1000 Subject: cpumask: make "nr_cpumask_bits" unsigned Bit searching functions accept "unsigned long" indices but "nr_cpumask_bits" is "int" which is signed, so inevitable sign extensions occur on x86_64. Those MOVSX are #1 MOVSX bloat by number of uses across whole kernel. Change "nr_cpumask_bits" to unsigned, this number can't be negative after all. It allows to do implicit zero-extension on x86_64 without MOVSX. Change signed comparisons into unsigned comparisons where necessary. Other uses looks fine because it is either argument passed to a function or comparison is already unsigned. Net win on allyesconfig type of kernel: ~2.8 KB (!) add/remove: 0/0 grow/shrink: 8/725 up/down: 93/-2926 (-2833) function old new delta xen_exit_mmap 691 735 +44 qstat_read 426 440 +14 __cpufreq_cooling_register 1678 1687 +9 trace_rb_cpu_prepare 447 455 +8 vermagic 54 60 +6 nfp_driver_version 54 60 +6 rcu_torture_stats_print 1147 1151 +4 find_next_push_cpu 267 269 +2 xen_irq_resume 961 960 -1 ... init_vp_index 946 906 -40 od_set_powersave_bias 328 281 -47 power_cpu_exit 193 139 -54 arch_show_interrupts 3538 3484 -54 select_idle_sibling 1558 1471 -87 Total: Before=158358910, After=158356077, chg -0.00% Same arguments apply to "nr_cpu_ids" but I haven't yet found enough courage to delve into this issue (and proper fix may require new type "cpu_t" which is whole separate story). 
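A minimal before/after sketch of the codegen effect (hypothetical helpers, not part
of the patch): the bit-search primitives take an unsigned long size, so a signed
limit must be sign-extended at every call site on x86_64, while an unsigned limit
can be zero-extended implicitly:

	#include <linux/bitops.h>

	extern int signed_limit;		/* stand-in for the old nr_cpumask_bits */
	extern unsigned int unsigned_limit;	/* stand-in for the new nr_cpumask_bits */

	unsigned long first_cpu_signed(const unsigned long *mask)
	{
		return find_first_bit(mask, signed_limit);	/* movsx before the call */
	}

	unsigned long first_cpu_unsigned(const unsigned long *mask)
	{
		return find_first_bit(mask, unsigned_limit);	/* implicit 32->64 zero extension */
	}
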
Link: http://lkml.kernel.org/r/20170309205322.GA1728@avx2 Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton --- arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- include/linux/cpumask.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 8c35b3152e1e..44b50646582d 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1c0b58545c04..534a10c00c54 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c..a3ba193f042e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* -- cgit v1.2.3 From 28c968ba5ee884e98f2c54da11b6336586bdea17 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:31 +1000 Subject: crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE Patch series "kexec/fadump: remove dependency with CONFIG_KEXEC and reuse crashkernel parameter for fadump", v4. Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. This patchset removes dependency with CONFIG_KEXEC for crashkernel parameter and vmcoreinfo related code as it can be reused without kexec support. Also, crashkernel parameter is reused instead of fadump_reserve_mem to reserve memory for fadump. The first patch moves crashkernel parameter parsing and vmcoreinfo related code under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The second patch reuses the definitions of append_elf_note() & final_note() functions under CONFIG_CRASH_CORE in IA64 arch code. The third patch removes dependency on CONFIG_KEXEC for firmware-assisted dump (fadump) in powerpc. The next patch reuses crashkernel parameter for reserving memory for fadump, instead of the fadump_reserve_mem parameter. This has the advantage of using all syntaxes crashkernel parameter supports, for fadump as well. The last patch updates fadump kernel documentation about use of crashkernel parameter. 
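For reference, the crashkernel= forms handled by the code being moved (see
parse_crashkernel_simple(), parse_crashkernel_mem() and parse_crashkernel_suffix()
below) are, schematically:

	crashkernel=256M			fixed size
	crashkernel=256M@64M			fixed size at a fixed offset
	crashkernel=512M-2G:64M,2G-:128M	size chosen by system RAM range
	crashkernel=512M,high			size with a ,high/,low placement suffix
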
This patch (of 5): Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. But currently, code related to vmcoreinfo and parsing of crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without dependency on CONFIG_KEXEC. There is no functional change with this patch. Link: http://lkml.kernel.org/r/149035338104.6881.4550894432615189948.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Cc: Fenghua Yu Cc: Tony Luck Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/Kconfig | 4 + include/linux/crash_core.h | 65 +++++++ include/linux/kexec.h | 57 +----- include/linux/printk.h | 4 +- kernel/Makefile | 1 + kernel/crash_core.c | 445 +++++++++++++++++++++++++++++++++++++++++++++ kernel/kexec_core.c | 403 ---------------------------------------- kernel/ksysfs.c | 8 +- kernel/printk/printk.c | 6 +- 9 files changed, 531 insertions(+), 462 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..ffdf5e3833da 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config HAVE_IMA_KEXEC diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 000000000000..18d0f946fda3 --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,65 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, 
#field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); + +#endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..c9481ebcbc0c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,17 +14,15 @@ #if !defined(__ASSEMBLY__) +#include #include #include #ifdef CONFIG_KEXEC_CORE #include -#include #include #include -#include -#include #include #include @@ -62,19 +60,15 @@ #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define KEXEC_CORE_NOTE_NAME "CORE" -#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4) -#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME + /* * The per-cpu notes area is a list of notes terminated by a "NULL" * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. 
*/ #ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ - KEXEC_CORE_NOTE_NAME_BYTES + \ - KEXEC_CORE_NOTE_DESC_BYTES ) +#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES #endif /* @@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -303,31 +270,15 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ - + VMCOREINFO_NOTE_NAME_BYTES) - /* Location of a reserved region to hold the crash kernel. 
*/ extern struct resource crashk_res; extern struct resource crashk_low_res; -typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/printk.h b/include/linux/printk.h index 571257e0f53d..e10f27468322 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -198,7 +198,7 @@ extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); -void log_buf_kexec_setup(void); +void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); @@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void) return 0; } -static inline void log_buf_kexec_setup(void) +static inline void log_buf_vmcoreinfo_setup(void) { } diff --git a/kernel/Makefile b/kernel/Makefile index b302b4731d16..72aa080f91f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..4261587a34d2 --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,445 @@ +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +#include +#include + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) 
+{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b3872..9dd722912850 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. 
*/ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -1084,403 +1078,6 @@ static int __init crash_notes_memory_init(void) subsys_initcall(crash_notes_memory_init); -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - 
VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2984fb0f0257..721d37b248c7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); -- cgit v1.2.3 From 562e209d35194eb900be9a3cef4572e3836f89d8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:31 +1000 Subject: ia64: reuse append_elf_note() and final_note() functions Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE Also, define Elf_Word and use it instead of generic u32 or the more specific Elf64_Word. 
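A short usage sketch of the shared helpers ("note_buf" is a hypothetical, suitably
sized buffer): a caller appends one or more notes and terminates the list with an
empty note, as the existing in-tree callers do:

	struct elf_prstatus prstatus = { };
	Elf_Word *buf = note_buf;

	buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);	/* an all-zero elf_note terminates the note list */
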
Link: http://lkml.kernel.org/r/149035342324.6881.11667840929850361402.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Acked-by: Tony Luck Cc: Fenghua Yu Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/ia64/kernel/crash.c | 22 ---------------------- include/linux/crash_core.h | 4 ++++ include/linux/elf.h | 2 ++ kernel/crash_core.c | 34 ++++++++++++++-------------------- kernel/kexec_core.c | 28 ---------------------------- 5 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f359e2a7..75859a07d75b 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 18d0f946fda3..541a197ba4a2 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,10 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8ae313..ba069e8f4f78 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4261587a34d2..fcbd568f1e95 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -291,32 +291,26 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } -static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, - void *data, size_t data_len) +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) { - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + 
buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } -static void final_note(u32 *buf) +void final_note(Elf_Word *buf) { - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); + memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9dd722912850..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -990,34 +990,6 @@ unlock: return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; -- cgit v1.2.3 From 0c9d00685af0b6363ffe74bd124997849c6dbc36 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:32 +1000 Subject: powerpc/fadump: remove dependency with CONFIG_KEXEC Now that crashkernel parameter parsing and vmcoreinfo related code is moved under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE, remove dependency with CONFIG_KEXEC for CONFIG_FA_DUMP. While here, get rid of definitions of fadump_append_elf_note() & fadump_final_note() functions to reuse similar functions compiled under CONFIG_CRASH_CORE. Link: http://lkml.kernel.org/r/149035343956.6881.1536459326017709354.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 10 ++++++---- arch/powerpc/include/asm/fadump.h | 2 ++ arch/powerpc/kernel/crash.c | 2 -- arch/powerpc/kernel/fadump.c | 34 +++------------------------------- arch/powerpc/kernel/setup-common.c | 5 +++++ 5 files changed, 16 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..6bc1fa232341 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -522,21 +522,23 @@ config RELOCATABLE_TEST relocation code. config CRASH_DUMP - bool "Build a kdump crash kernel" + bool "Build a dump capture kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help - Build a kernel suitable for use as a kdump capture kernel. + Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump capture kernel. config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC_CORE + depends on PPC64 && PPC_RTAS + select CRASH_CORE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. 
This approach does not use kexec, - instead firmware assists in booting the kdump kernel + instead firmware assists in booting the capture kernel while preserving memory contents. Firmware-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806475f0..60b91084f33c 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -73,6 +73,8 @@ reg_entry++; \ }) +extern int crashing_cpu; + /* Kernel Dump section info */ struct fadump_section { __be32 request_flag; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 47b63de81f9b..cbabb5adccd9 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -43,8 +43,6 @@ #define IPI_TIMEOUT 10000 #define REAL_MODE_TIMEOUT 10000 -/* This keeps a track of which one is the crashing cpu. */ -int crashing_cpu = -1; static int time_to_dump; #define CRASH_HANDLER_MAX 3 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ff0dd4e77a7..31c0abec51cf 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -509,34 +509,6 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) return reg_entry; } -static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void fadump_final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -547,8 +519,8 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = ???? */ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); return buf; } @@ -689,7 +661,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) note_buf = fadump_regs_to_elf_notes(note_buf, ®s); } } - fadump_final_note(note_buf); + final_note(note_buf); if (fdh) { pr_debug("Updating elfcore header (%llx) with cpu notes\n", diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4697da895133..508dd9af20b1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -125,6 +125,11 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif +#ifdef CONFIG_CRASH_CORE +/* This keeps a track of which one is the crashing cpu. */ +int crashing_cpu = -1; +#endif + /* also used by kexec */ void machine_shutdown(void) { -- cgit v1.2.3 From 74177c5d453e368b274e49331267f3a81fb28253 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:32 +1000 Subject: powerpc/fadump: reuse crashkernel parameter for fadump memory reservation fadump supports specifying memory to reserve for fadump's crash kernel with fadump_reserve_mem kernel parameter. 
This parameter currently supports passing a fixed memory size, as in fadump_reserve_mem=<size>, only. This patch aims to add support for other syntaxes, like the range-based form <range1>:<size1>[,<range2>:<size2>,...], which allows using the same parameter to boot the kernel with different system RAM sizes. As the crashkernel parameter already supports the above-mentioned syntaxes, this patch deprecates the fadump_reserve_mem parameter and reuses the crashkernel parameter instead, to specify memory for fadump's crash kernel memory reservation as well. If any offset is provided in the crashkernel parameter, it is ignored in the case of fadump, as fadump reserves memory at the end of RAM. The advantages of using the crashkernel parameter instead of fadump_reserve_mem are one less kernel parameter overall, code reuse, and support for multiple syntaxes to specify memory. Suggested-by: Dave Young Link: http://lkml.kernel.org/r/149035346749.6881.911095631212975718.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/powerpc/kernel/fadump.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 31c0abec51cf..e013f8fa6f99 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -210,14 +210,20 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, */ static inline unsigned long fadump_calculate_reserve_size(void) { - unsigned long size; + int ret; + unsigned long long base, size; /* - * Check if the size is specified through fadump_reserve_mem= cmdline - * option. If yes, then use that. + * Check if the size is specified through crashkernel= cmdline + * option. If yes, then use that but ignore base as fadump + * reserves memory at end of RAM. */ - if (fw_dump.reserve_bootvar) + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &size, &base); + if (ret == 0 && size > 0) { + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -353,15 +359,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, &p); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; -- cgit v1.2.3 From be8232029aaeec1258d04f99ac9196125cbc7ed4 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:33 +1000 Subject: powerpc/fadump: update documentation about crashkernel parameter reuse As we are reusing the crashkernel parameter instead of the fadump_reserve_mem parameter to specify the memory to reserve for fadump's crash kernel, update the documentation accordingly.
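As an editorial illustration, not part of this patch (the sizes here are made up; the exact grammar is documented in Documentation/kdump/kdump.txt), a fadump kernel can now be booted with either a fixed or a range-based reservation:

    fadump=on crashkernel=2G
    fadump=on crashkernel=4G-64G:2G,64G-:4G
    fadump=on crashkernel=2G@32M    (the @32M offset is accepted but ignored, since fadump reserves memory at the end of RAM)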
Link: http://lkml.kernel.org/r/149035347559.6881.14224829694291758581.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Signed-off-by: Andrew Morton --- Documentation/powerpc/firmware-assisted-dump.txt | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc98af28..8394bc8e34d7 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -55,10 +55,14 @@ as follows: booted with restricted memory. By default, the boot memory size will be the larger of 5% of system RAM or 256MB. Alternatively, user can also specify boot memory size - through boot parameter 'fadump_reserve_mem=' which will - override the default calculated size. Use this option - if default boot memory size is not sufficient for second - kernel to boot successfully. + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/kdump/kdump.txt. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump reserves memory at end of RAM for boot memory + dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will @@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. -3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline +3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. -NOTE: If firmware-assisted dump fails to reserve memory then it will - fallback to existing kdump mechanism if 'crashkernel=' option - is set at kernel cmdline. +NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. Sysfs/debugfs files: ------------ -- cgit v1.2.3 From 15f71e0aff918af8c8bc28268ce5f34f365db257 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 5 Apr 2017 09:21:33 +1000 Subject: kdump, vmcoreinfo: report actual value of phys_base Currently, VMCOREINFO note information reports the virtual address of phys_base that is assigned to symbol phys_base. But this doesn't make sense because to refer to phys_base, it's necessary to get the value of phys_base itself we are now about to refer to. Userland tools related to kdump such as makedumpfile and crash utility so far have made some efforts to calculate phys_base on crash dump formats generated by mechanisms running outside Linux kernel, such as virtual machine hypervisor such as qemu dump, which ordinary users use via virsh dump, or ones implemented on vendor specific firmware. That is, find a kernel data whose virtual and physical addresses are available via its note information and calculate phys_base from it. 
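(As a sketch of that calculation, not part of this patch: on x86_64, given a datum whose link-time virtual address vaddr and run-time physical address paddr are both known from the notes, the relocation offset follows as roughly

    phys_base = paddr - (vaddr - __START_KERNEL_map);

with the caveat that real tools must also check which kernel mapping the virtual address belongs to.)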
However, such a data structure is not one prepared for the purpose of deriving phys_base. There's no guarantee that other crash dump mechanisms include such information that can be used to calculate phys_base similarly. To get VMCOREINFO from a vmcore, it's easy to use the strings and grep commands like this; VMCOREINFO consists of simple strings: $ strings vmcore-3.10.0-121.el7.x86_64 | grep -E ".*VMCOREINFO.*" -A 100 VMCOREINFO OSRELEASE=3.10.0-121.el7.x86_64 PAGESIZE=4096 ... This is also useful to get the value of phys_base in the kdump 2nd kernel contained in a vmcore using the above-mentioned external crash dump mechanism; the kdump 2nd kernel is an inherently relocated kernel. This commit doesn't remove the VMCOREINFO_SYMBOL(phys_base) line, because makedumpfile refers to it and removing it would break old versions of makedumpfile. Signed-off-by: HATAYAMA Daisuke Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Dave Anderson Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 1 + include/linux/crash_core.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 857cdbd02867..8b9947c414c4 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -341,6 +341,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 541a197ba4a2..eb71a70ea2b5 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -50,6 +50,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; -- cgit v1.2.3 From 76e0c165b113a14f6fdce951675e28b6371abd97 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 5 Apr 2017 09:21:34 +1000 Subject: uapi: fix linux/sysctl.h userspace compilation errors Include <stddef.h> (guarded by #ifndef __KERNEL__) to fix the following linux/sysctl.h userspace compilation errors: /usr/include/linux/sysctl.h:38:2: error: unknown type name 'size_t' size_t *oldlenp; /usr/include/linux/sysctl.h:40:2: error: unknown type name 'size_t' size_t newlen; This also fixes userspace compilation of uapi headers that include linux/sysctl.h, e.g. linux/netfilter.h. Link: http://lkml.kernel.org/r/20170222230652.GA14373@altlinux.org Signed-off-by: Dmitry V. Levin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- include/uapi/linux/sysctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d2b12152e358..c6d18aaeb3a4 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include #include +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl?
In other words, what is the largest acceptable value for the nlen -- cgit v1.2.3 From beb35b0574f603c376a8da0f4f08ef594dde4fa7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 5 Apr 2017 09:21:35 +1000 Subject: fs/nsfs.c: allow ns_entries to have custom symlink content Patch series "Expose task pid_ns_for_children to userspace". pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the course of their work. If they have a custom pid_ns_for_children before dump, they must have the same ns after restore. Otherwise, the restored task bumps into an environment it does not expect. This patchset solves the problem. It exposes pid_ns_for_children to the ns directory in the standard way with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] This patch (of 2): Make it possible to have the link content prefix yyy differ from the link name xxx: $ readlink /proc/[pid]/ns/xxx yyy:[4026531838] This will be used in the patch "pidns: expose task pid_ns_for_children to userspace". Link: http://lkml.kernel.org/r/149086966919.4388.10564546347286785860.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton --- fs/nsfs.c | 4 +++- include/linux/proc_ns.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 1656843e87d2..495f12b83a7b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -195,9 +195,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d..88dba3b53375 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -14,6 +14,7 @@ struct inode; struct proc_ns_operations { const char *name; + const char *real_ns_name; int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); -- cgit v1.2.3 From 5a2ab9e5545bb110c2a31f9e07d40791f20aae18 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 5 Apr 2017 09:21:35 +1000 Subject: pidns: expose task pid_ns_for_children to userspace pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the course of their work.
This patch solves the problem, and it exposes pid_ns_for_children to ns directory in standard way with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] Link: http://lkml.kernel.org/r/149086967937.4388.471494976517194744.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton --- fs/proc/namespaces.c | 1 + include/linux/proc_ns.h | 1 + kernel/pid_namespace.c | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 88dba3b53375..58ab28d81fc2 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -27,6 +27,7 @@ extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9a..4dd02ff0b0bd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -374,6 +374,20 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -443,6 +457,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); -- cgit v1.2.3 From 65ab2c501ec974cbd5a7fa842b0d983933b7656e Mon Sep 17 00:00:00 2001 From: Zhang Xiao Date: Wed, 5 Apr 2017 09:21:36 +1000 Subject: taskstats: add e/u/stime for TGID command The elapsed time, user CPU time and system CPU time for the thread group status request are presently left at zero. Fill these in. 
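For illustration only (this fragment is not part of the patch, the helper name is made up, and the taskstats netlink request/reply plumbing is omitted), a userspace consumer that has already obtained a struct taskstats for a TGID would now see these accounting fields populated, all in microseconds:

    #include <stdio.h>
    #include <linux/taskstats.h>

    /* Print the thread-group times that this patch starts filling in. */
    static void print_tgid_times(const struct taskstats *ts)
    {
            printf("etime=%llu us utime=%llu us stime=%llu us\n",
                   (unsigned long long)ts->ac_etime,
                   (unsigned long long)ts->ac_utime,
                   (unsigned long long)ts->ac_stime);
    }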
Link: http://lkml.kernel.org/r/1488508424-12322-1-git-send-email-xiao.zhang@windriver.com Signed-off-by: Zhang Xiao Cc: Balbir Singh Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/taskstats.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..802a2bb7ccb9 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -210,6 +210,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; /* * Add additional stats from live tasks except zombie thread group @@ -238,6 +239,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); -- cgit v1.2.3 From 36645cb271707698d13fcdcb0a002d379e54b14d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:36 +1000 Subject: taskstats-add-e-u-stime-for-tgid-command-fix run ktime_get_ns() a single time Cc: Balbir Singh Cc: Oleg Nesterov Cc: Zhang Xiao Signed-off-by: Andrew Morton --- kernel/taskstats.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 802a2bb7ccb9..1247db3ff002 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -211,6 +211,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) unsigned long flags; int rc = -ESRCH; u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -228,6 +229,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time = ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -240,7 +242,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) delayacct_add_tsk(stats, tsk); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + delta = start_time - tsk->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_etime += delta; -- cgit v1.2.3 From 70658bc98c977ff8abcf809e023fb3e0e2ef02df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:37 +1000 Subject: taskstats-add-e-u-stime-for-tgid-command-fix-fix include linux/sched/cputime.h for task_cputime() Cc: Balbir Singh Cc: Oleg Nesterov Cc: Zhang Xiao Signed-off-by: Andrew Morton --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1247db3ff002..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Maximum length of a cpumask that can be specified in -- cgit v1.2.3 From e342b172a3e716b213689dfdb0ef62e7de5e59ab Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:38 +1000 Subject: kcov: simplify interrupt check in_interrupt() semantics are confusing and wrong for most users as it also returns true when bh is disabled. Thus we open coded a proper check for interrupts in __sanitizer_cov_trace_pc() with a lengthy explanatory comment. 
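For reference, and paraphrasing include/linux/preempt.h rather than anything in this patch, the replacement predicate reduces to a single preempt_count() test:

    #define in_task()	(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

so it is true only in plain process context, and false in hard IRQ, soft IRQ and NMI context.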
Use the new in_task() predicate instead. Link: http://lkml.kernel.org/r/20170321091026.139655-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Kefeng Wang Cc: James Morse Cc: Alexander Popov Cc: Andrey Konovalov Cc: Hillf Danton Signed-off-by: Andrew Morton --- kernel/kcov.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791..cd771993f96f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. - * The checks for whether we are in an interrupt are open-coded, because - * 1. We can't use in_interrupt() here, since it also returns true - * when we are inside local_bh_disable() section. - * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), - * since that leads to slower generated code (three separate tests, - * one for each of the flags). */ - if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET - | NMI_MASK))) + if (!t || !in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From 0e01f1d018f05f36e3fdf18c48242a757ac17808 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Wed, 5 Apr 2017 09:21:39 +1000 Subject: scripts/gdb: add lx-fdtdump command lx-fdtdump dumps the flattened device tree passed to the kernel from the bootloader to the filename specified as the command argument. If no argument is provided it defaults to fdtdump.dtb. This then allows further post processing on the machine running GDB. The fdt header is also also printed in the GDB console. For example: (gdb) lx-fdtdump fdt_magic: 0xD00DFEED fdt_totalsize: 0xC108 off_dt_struct: 0x38 off_dt_strings: 0x3804 off_mem_rsvmap: 0x28 version: 17 last_comp_version: 16 Dumped fdt to fdtdump.dtb >fdtdump fdtdump.dtb | less This command is useful as the bootloader can often re-write parts of the device tree, and this can sometimes cause the kernel to not boot. Link: http://lkml.kernel.org/r/1481280065-5336-2-git-send-email-kbingham@kernel.org Signed-off-by: Peter Griffin Signed-off-by: Kieran Bingham Cc: Jason Wessel Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 7 ++++ scripts/gdb/linux/proc.py | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 7986f4e0da12..7aad82406422 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -14,6 +14,7 @@ #include #include +#include /* We need to stringify expanded macros so that they can be parsed */ @@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC) LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) + +/* linux/of_fdt.h> */ +LX_VALUE(OF_DT_HEADER) + +/* Kernel Configs */ +LX_CONFIG(CONFIG_OF) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 38b1f09d1cd9..086d27223c0c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -16,6 +16,7 @@ from linux import constants from linux import utils from linux import tasks from linux import lists +from struct import * class LxCmdLine(gdb.Command): @@ -195,3 +196,75 @@ values of that process namespace""" info_opts(MNT_INFO, m_flags))) LxMounts() + + +class LxFdtDump(gdb.Command): + """Output Flattened Device Tree header and dump FDT blob to the filename + specified as the command argument. 
Equivalent to + 'cat /proc/fdt > fdtdump.dtb' on a running target""" + + def __init__(self): + super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def fdthdr_to_cpu(self, fdt_header): + + fdt_header_be = ">IIIIIII" + fdt_header_le = " Date: Wed, 5 Apr 2017 09:21:40 +1000 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 +++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a7ff409f386d..0ca25413ad2d 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -38,6 +38,9 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +struct device; +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. 
-- cgit v1.2.3 From d2cf93a6f60fa275105a284f28e8f1f27276804d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:40 +1000 Subject: kernel-reboot-add-devm_register_reboot_notifier-fix move `struct device' forward declaration to top-of-file Cc: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 0ca25413ad2d..ecbf7b56b9db 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -5,6 +5,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -38,7 +40,6 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); -struct device; extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); extern int register_restart_handler(struct notifier_block *); -- cgit v1.2.3 From cb20e61ae7843089f254998cfa01e81b98c9e0ff Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:41 +1000 Subject: lib/fault-inject.c: use correct check for interrupts in_interrupt() also returns true when bh is disabled in task context. That's not what fail_task() wants to check. Use the new in_task() predicate that does the right thing. Link: http://lkml.kernel.org/r/20170321091805.140676-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- lib/fault-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 6a823a53e357..4ff157159a0d 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -56,7 +56,7 @@ static void fail_dump(struct fault_attr *attr) static bool fail_task(struct fault_attr *attr, struct task_struct *task) { - return !in_interrupt() && task->make_it_fail; + return in_task() && task->make_it_fail; } #define MAX_STACK_TRACE_DEPTH 32 -- cgit v1.2.3 From a85fa626d581c57c4a9efedfe4811bf0671aec62 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:41 +1000 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: === Write to this file of integer N makes N-th call in the current task fail (N is 0-based). Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below. === Why adding new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. 
Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Signed-off-by: Andrew Morton --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index c87b6b9a8a76..9fc0b20644a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1358,6 +1358,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return len; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3302,6 +3349,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. 
+ */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c3ee2281a56..27bcbd8269f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -941,6 +941,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..a0151354c959 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit v1.2.3 From 3d332675fd34293faf3af6d0ad4f4f2c54914efa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:42 +1000 Subject: fault-inject-support-systematic-fault-injection-fix fix build Cc: Akinobu Mita Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9fc0b20644a6..ff6d71bcca59 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1377,7 +1377,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, if (n < 0 || n == INT_MAX) return -EINVAL; current->fail_nth = n + 1; - return len; + return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, -- cgit v1.2.3 From bb95cc4e70630cf553f54eec6baabe0787d2060b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 5 Apr 2017 09:21:43 +1000 Subject: lib/zlib_inflate/inftrees.c: fix potential buffer overflow smatch says: WARNING: please, no spaces at the start of a line #30: FILE: lib/zlib_inflate/inftrees.c:112: + for (min = 1; min < MAXBITS; min++)$ total: 0 errors, 1 warnings, 8 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/zlib-inflate-fix-potential-buffer-overflow.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Signed-off-by: Andrew Morton --- lib/zlib_inflate/inftrees.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index 3fe6ce5b53e5..028943052926 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -109,7 +109,7 @@ int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes, *bits = 1; return 0; /* no symbols, but wait for decoding to report error */ } - for (min = 1; min <= MAXBITS; min++) + for (min = 1; min < MAXBITS; min++) if (count[min] != 0) break; if (root < min) root = min; -- cgit v1.2.3 From dc2f0d5206967436bfecd01e759099b827004d49 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 5 Apr 2017 09:21:43 +1000 Subject: initramfs: provide a way to ignore image provided by bootloader Many "embedded" architectures provide CMDLINE_FORCE to allow the kernel to override the command line provided by an inflexible bootloader. However there is currrently no way for the kernel to override the initramfs image provided by the bootloader meaning there are still ways for bootloaders to make things difficult for us. Fix this by introducing INITRAMFS_FORCE which can prevent the kernel from loading the bootloader supplied image. We use CMDLINE_FORCE (and its friend CMDLINE_EXTEND) to imply that the system has an inflexible bootloader. This allow us to avoid presenting this config option to users of systems where inflexible bootloaders aren't usually a problem. Link: http://lkml.kernel.org/r/20170217121940.30126-1-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson Cc: Al Viro Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- usr/Kconfig | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 981f286c1d16..bf7707b44988 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -611,7 +611,7 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - if (initrd_start) { + if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); diff --git a/usr/Kconfig b/usr/Kconfig index 6278f135256d..c0c48507e44e 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -21,6 +21,16 @@ config INITRAMFS_SOURCE If you are not sure, leave it blank. +config INITRAMFS_FORCE + bool "Ignore the initramfs passed by the bootloader" + depends on CMDLINE_EXTEND || CMDLINE_FORCE + help + This option causes the kernel to ignore the initramfs image + (or initrd image) passed to it by the bootloader. This is + analogous to CMDLINE_FORCE, which is found on some architectures, + and is useful if you cannot or don't want to change the image + your bootloader passes to the kernel. + config INITRAMFS_ROOT_UID int "User ID to map to 0 (user root)" depends on INITRAMFS_SOURCE!="" -- cgit v1.2.3 From 975c2abe355d835bfb01407cc22ced366261de45 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 5 Apr 2017 09:21:44 +1000 Subject: initramfs: use vfs_stat/lstat directly sys_newlstat is a system call implementation that is meant for user space, and that copies kernel-internal data structure to the user format, which is not needed for in-kernel users. Further, as we rearrange the system call implementation so we can extend it with 64-bit time_t, the prototype for sys_newlstat changes. 
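For reference, the in-kernel helpers used below operate on struct kstat directly; their signatures (paraphrased from include/linux/fs.h, with parameter names added here for readability) are roughly:

    int vfs_stat(const char __user *name, struct kstat *stat);
    int vfs_lstat(const char __user *name, struct kstat *stat);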
This changes the initramfs code to use vfs_lstat directly, to get it out of the way of the time_t changes, and make it slightly more efficient in the process. Along the same lines we also replace sys_stat and sys_stat64 with vfs_stat. Link: http://lkml.kernel.org/r/20170314214932.4052842-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Alexander Viro Signed-off-by: Andrew Morton --- init/do_mounts.h | 22 ++++------------------ init/initramfs.c | 12 ++++++------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/init/do_mounts.h b/init/do_mounts.h index 067af1d9e8b6..282d65bfd674 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev) return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); } -#if BITS_PER_LONG == 32 static inline u32 bstat(char *name) { - struct stat64 stat; - if (sys_stat64(name, &stat) != 0) + struct kstat stat; + if (vfs_stat(name, &stat) != 0) return 0; - if (!S_ISBLK(stat.st_mode)) + if (!S_ISBLK(stat.mode)) return 0; - if (stat.st_rdev != (u32)stat.st_rdev) - return 0; - return stat.st_rdev; -} -#else -static inline u32 bstat(char *name) -{ - struct stat stat; - if (sys_newstat(name, &stat) != 0) - return 0; - if (!S_ISBLK(stat.st_mode)) - return 0; - return stat.st_rdev; + return stat.rdev; } -#endif #ifdef CONFIG_BLK_DEV_RAM diff --git a/init/initramfs.c b/init/initramfs.c index bf7707b44988..a5b686696535 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -312,10 +312,10 @@ static int __init maybe_link(void) static void __init clean_path(char *path, umode_t fmode) { - struct stat st; + struct kstat st; - if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) { - if (S_ISDIR(st.st_mode)) + if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) sys_rmdir(path); else sys_unlink(path); @@ -581,13 +581,13 @@ static void __init clean_rootfs(void) num = sys_getdents64(fd, dirp, BUF_SIZE); while (num > 0) { while (num > 0) { - struct stat st; + struct kstat st; int ret; - ret = sys_newlstat(dirp->d_name, &st); + ret = vfs_lstat(dirp->d_name, &st); WARN_ON_ONCE(ret); if (!ret) { - if (S_ISDIR(st.st_mode)) + if (S_ISDIR(st.mode)) sys_rmdir(dirp->d_name); else sys_unlink(dirp->d_name); -- cgit v1.2.3 From 357d0c1c96dc69abe858381237d2573a766987b6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 5 Apr 2017 09:21:45 +1000 Subject: ipc/shm: some shmat cleanups Clean up early flag and address some minutia. Link: http://lkml.kernel.org/r/1486673582-6979-3-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton --- ipc/shm.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 481d2a9c298a..34c4344e8d4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1095,11 +1095,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; - unsigned long addr; + unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file; int err; - unsigned long flags; + unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; @@ -1111,7 +1111,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, err = -EINVAL; if (shmid < 0) goto out; - else if ((addr = (ulong)shmaddr)) { + + if (addr) { if (addr & (shmlba - 1)) { /* * Round down to the nearest multiple of shmlba. 
@@ -1126,13 +1127,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, #endif goto out; } - flags = MAP_SHARED | MAP_FIXED; - } else { - if ((shmflg & SHM_REMAP)) - goto out; - flags = MAP_SHARED; - } + flags |= MAP_FIXED; + } else if ((shmflg & SHM_REMAP)) + goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; -- cgit v1.2.3 From 35ea10aa22077996ff58d495aca9397e61f4c1f6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 5 Apr 2017 09:21:45 +1000 Subject: sysv,ipc: cacheline align kern_ipc_perm Assign 'struct kern_ipc_perm' its own cacheline to avoid false sharing with sysv ipc calls. While the structure itself is rather read-mostly throughout the lifespan of ipc, the spinlock causes most of the invalidations. One example is 31a7c4746e9 ("ipc/sem.c: cacheline align the ipc spinlock for semaphores"). Therefore, extend this to all ipc. The effect of cacheline alignment on sems can be seen in sembench, which deals mostly with semtimedop wait/wakes is seen to improve raw throughput (worker loops) between 8 to 12% on a 24-core x86 with over 4 threads. Link: http://lkml.kernel.org/r/1486673582-6979-4-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton --- include/linux/ipc.h | 7 +++---- include/linux/sem.h | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 9d84942ae2e5..71fd92d81b26 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,7 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ /* used by in-kernel data structures */ -struct kern_ipc_perm -{ +struct kern_ipc_perm { spinlock_t lock; bool deleted; int id; @@ -18,9 +17,9 @@ struct kern_ipc_perm kgid_t gid; kuid_t cuid; kgid_t cgid; - umode_t mode; + umode_t mode; unsigned long seq; void *security; -}; +} ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/sem.h b/include/linux/sem.h index 4fc222f8755d..9edec926e9d9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -10,8 +10,7 @@ struct task_struct; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { - struct kern_ipc_perm ____cacheline_aligned_in_smp - sem_perm; /* permissions .. see ipc.h */ + struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ -- cgit v1.2.3
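A closing editorial aside, not part of this series: the effect of ____cacheline_aligned_in_smp can be sketched in plain C. Assuming a 64-byte cache line (a stand-in for SMP_CACHE_BYTES; the real value is architecture-specific) and a made-up struct perm_like standing in for kern_ipc_perm, aligning the structure means adjacent objects never share a line, so one object's hot spinlock no longer false-shares with its neighbour:

    #include <stdio.h>

    #define CACHELINE 64	/* assumed cache line size, standing in for SMP_CACHE_BYTES */

    /* A stand-in for kern_ipc_perm: on SMP builds ____cacheline_aligned_in_smp
     * expands to an aligned attribute much like this one. */
    struct perm_like {
    	int lock;		/* stands in for spinlock_t */
    	unsigned long seq;
    } __attribute__((aligned(CACHELINE)));

    int main(void)
    {
    	struct perm_like a[2];

    	/* sizeof is rounded up to the alignment, so a[0] and a[1]
    	 * start on different cache lines. */
    	printf("sizeof=%zu stride=%zu\n", sizeof(a[0]),
    	       (size_t)((char *)&a[1] - (char *)&a[0]));
    	return 0;
    }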