From 0eef5ca171aac20bf116132b4326745ee39ed1ef Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 5 Apr 2017 09:19:23 +1000 Subject: mm: fix page_vma_mapped_walk() for ksm pages Doug Smythies reports oops with KSM in this backtrace, I've been seeing the same: page_vma_mapped_walk+0xe6/0x5b0 page_referenced_one+0x91/0x1a0 rmap_walk_ksm+0x100/0x190 rmap_walk+0x4f/0x60 page_referenced+0x149/0x170 shrink_active_list+0x1c2/0x430 shrink_node_memcg+0x67a/0x7a0 shrink_node+0xe1/0x320 kswapd+0x34b/0x720 Just as 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() for ksm pages") observed, you cannot use page->index calculations on ksm pages. page_vma_mapped_walk() is relying on __vma_address(), where a ksm page can lead it off the end of the page table, and into whatever nonsense is in the next page, ending as an oops inside check_pte()'s pte_page(). KSM tells page_vma_mapped_walk() exactly where to look for the page, it does not need any page->index calculation: and that's so also for all the normal and file and anon pages - just not for THPs and their subpages. Get out early in most cases: instead of a PageKsm test, move down the earlier not-THP-page test, as suggested by Kirill. I'm also slightly worried that this loop can stray into other vmas, so added a vm_end test to prevent surprises; though I have not imagined anything worse than a very contrived case, in which a page mlocked in the next vma might be reclaimed because it is not mlocked in this vma. Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()") Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1704031104400.1118@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Doug Smythies Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c4c9def8ffea..de9c40d7304a 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -111,12 +111,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - /* Only for THP, seek to next pte entry makes sense */ - if (pvmw->pte) { - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) - return not_found(pvmw); + if (pvmw->pte) goto next_pte; - } if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ @@ -165,9 +161,14 @@ restart: while (1) { if (check_pte(pvmw)) return true; -next_pte: do { +next_pte: + /* Seek to next pte only makes sense for THP */ + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= + if (pvmw->address >= pvmw->vma->vm_end || + pvmw->address >= __vma_address(pvmw->page, pvmw->vma) + hpage_nr_pages(pvmw->page) * PAGE_SIZE) return not_found(pvmw); -- cgit v1.2.3 From f3de4d62683128ee0b46e8536173b8c3e7ff0b62 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 5 Apr 2017 09:19:25 +1000 Subject: userfaultfd: report actual registered features in fdinfo fdinfo for userfault file descriptor reports UFFD_API_FEATURES. Up until recently, the UFFD_API_FEATURES was defined as 0, therefore corresponding field in fdinfo always contained zero. Now, with introduction of several additional features, UFFD_API_FEATURES is not longer 0 and it seems better to report actual features requested for the userfaultfd object described by the fdinfo. First, the applications that were using userfault will still see zero at the features field in fdinfo. 
Next, reporting actual features rather than available features gives a clear indication of what userfault features are used by an application. Link: http://lkml.kernel.org/r/1491140181-22121-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d227b0fcf49..f7555fc25877 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1756,7 +1756,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) * protocols: aa:... bb:... */ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", - pending, total, UFFD_API, UFFD_API_FEATURES, + pending, total, UFFD_API, ctx->features, UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); } #endif -- cgit v1.2.3 From dfc6d75e9e61869b2a1de4acbb601c584c8dd1d3 Mon Sep 17 00:00:00 2001 From: Alexander Polakov Date: Wed, 5 Apr 2017 09:19:26 +1000 Subject: mm/page_alloc.c: fix print order in show_free_areas() Fixes: 11fb998986a72a ("mm: move most file-based accounting to the node") Link: http://lkml.kernel.org/r/1490377730.30219.2.camel@beget.ru Signed-off-by: Alexander Polyakov Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..d6a665057d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4519,13 +4519,13 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif - K(node_page_state(pgdat, NR_SHMEM)), K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), -- cgit v1.2.3 From 939897e2d736e25ee8917096de1586226fed9a7b Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 5 Apr 2017 09:19:27 +1000 Subject: vmlinux.lds: add missing VMLINUX_SYMBOL macros When __{start,end}_ro_after_init is referenced from C code, we run into the following build errors on blackfin: kernel/extable.c:169: undefined reference to `__start_ro_after_init' kernel/extable.c:169: undefined reference to `__end_ro_after_init' The build error is due to the fact that blackfin is one of the few arches that prepends an underscore '_' to all symbols defined in C. Fix this by wrapping __{start,end}_ro_after_init in vmlinux.lds.h with VMLINUX_SYMBOL(), which adds the necessary prefix for arches that have HAVE_UNDERSCORE_SYMBOL_PREFIX.
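For reference, C code reaches these section markers through extern array declarations; a minimal sketch of the idiom (the helper function below is hypothetical, only the two symbol names come from the patch):

    #include <linux/types.h>

    extern const char __start_ro_after_init[], __end_ro_after_init[];

    /* true if addr falls inside the .data..ro_after_init section */
    static bool addr_in_ro_after_init(unsigned long addr)
    {
            return addr >= (unsigned long)__start_ro_after_init &&
                   addr <  (unsigned long)__end_ro_after_init;
    }

On an arch with HAVE_UNDERSCORE_SYMBOL_PREFIX the compiler emits these references with a leading underscore, so the linker script must define the prefixed name as well, which is exactly what VMLINUX_SYMBOL() expands to.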
Link: http://lkml.kernel.org/r/1491259387-15869-1-git-send-email-jeyu@redhat.com Signed-off-by: Jessica Yu Acked-by: Kees Cook Cc: Arnd Bergmann Cc: Eddie Kovsky Signed-off-by: Andrew Morton --- include/asm-generic/vmlinux.lds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7cdfe167074f..143db9c523e2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -261,9 +261,9 @@ */ #ifndef RO_AFTER_INIT_DATA #define RO_AFTER_INIT_DATA \ - __start_ro_after_init = .; \ + VMLINUX_SYMBOL(__start_ro_after_init) = .; \ *(.data..ro_after_init) \ - __end_ro_after_init = .; + VMLINUX_SYMBOL(__end_ro_after_init) = .; #endif /* -- cgit v1.2.3 From ae03d3524968e4c0538d1652f3da906c07cfafe5 Mon Sep 17 00:00:00 2001 From: "bsegall@google.com" Date: Wed, 5 Apr 2017 09:19:29 +1000 Subject: ptrace: fix PTRACE_LISTEN race corrupting task->state In PT_SEIZED + LISTEN mode STOP/CONT signals cause a wakeup against __TASK_TRACED. If this races with the ptrace_unfreeze_traced at the end of a PTRACE_LISTEN, this can wake the task /after/ the check against __TASK_TRACED, but before the reset of state to TASK_TRACED. This causes it to instead clobber TASK_WAKING, allowing a subsequent wakeup against TRACED while the task is still on the rq wake_list, corrupting it. Link: http://lkml.kernel.org/r/xm26y3vfhmkp.fsf_-_@bsegall-linux.mtv.corp.google.com Signed-off-by: Ben Segall --- kernel/ptrace.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0af928712174..ede305b040ce 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up + * remotely. Recheck state under the lock to close this race. + */ spin_lock_irq(&task->sighand->siglock); - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } -- cgit v1.2.3 From 15743a3c805e67a54aaf72faacb7db5c6613a06b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:30 +1000 Subject: ptrace-fix-ptrace_listen-race-corrupting-task-state-checkpatch-fixes ERROR: code indent should use tabs where possible #39: FILE: kernel/ptrace.c:188: + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up$ ERROR: code indent should use tabs where possible #40: FILE: kernel/ptrace.c:189: + * remotely. 
Recheck state under the lock to close this race.$ ERROR: code indent should use tabs where possible #41: FILE: kernel/ptrace.c:190: + */$ WARNING: please, no spaces at the start of a line #47: FILE: kernel/ptrace.c:192: + if (task->state == __TASK_TRACED) {$ WARNING: suspect code indent for conditional statements (7, 15) #47: FILE: kernel/ptrace.c:192: + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) ERROR: code indent should use tabs where possible #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task))$ WARNING: please, no spaces at the start of a line #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task))$ WARNING: suspect code indent for conditional statements (15, 23) #48: FILE: kernel/ptrace.c:193: + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); ERROR: code indent should use tabs where possible #49: FILE: kernel/ptrace.c:194: + wake_up_state(task, __TASK_TRACED);$ WARNING: please, no spaces at the start of a line #49: FILE: kernel/ptrace.c:194: + wake_up_state(task, __TASK_TRACED);$ ERROR: code indent should use tabs where possible #50: FILE: kernel/ptrace.c:195: + else$ WARNING: please, no spaces at the start of a line #50: FILE: kernel/ptrace.c:195: + else$ ERROR: code indent should use tabs where possible #51: FILE: kernel/ptrace.c:196: + task->state = TASK_TRACED;$ WARNING: please, no spaces at the start of a line #51: FILE: kernel/ptrace.c:196: + task->state = TASK_TRACED;$ WARNING: please, no spaces at the start of a line #52: FILE: kernel/ptrace.c:197: + }$ total: 7 errors, 8 warnings, 21 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/ptrace-fix-ptrace_listen-race-corrupting-task-state.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Ben Segall Signed-off-by: Andrew Morton --- kernel/ptrace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ede305b040ce..266ddcc1d8bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,17 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); - /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up - * remotely. Recheck state under the lock to close this race. - */ + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. + * Recheck state under the lock to close this race. + */ spin_lock_irq(&task->sighand->siglock); - if (task->state == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; - } + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } -- cgit v1.2.3 From 6e19569d23154ab79a56db662752cb597481599d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:32 +1000 Subject: arm: arch/arm/include/asm/page.h needs personality.h VM_DATA_DEFAULT_FLAGS uses READ_IMPLIES_EXEC, so page.h should include personality.h to provide this. 
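For context, the macro in question tests the current task's personality bits; its definition is roughly of this shape (paraphrased sketch, not the verbatim arm definition):

    /* READ_IMPLIES_EXEC comes from <linux/personality.h> */
    #define VM_DATA_DEFAULT_FLAGS \
            (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
             VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

Without the include, the macro only compiles when some other header happens to pull personality.h in first.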
This fixes no known bugs and can be safely ignored ;) Cc: Russell King Cc: Will Deacon Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- arch/arm/include/asm/page.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 4355f0ec44d6..f98baaec0a15 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -17,6 +17,8 @@ #ifndef __ASSEMBLY__ +#include /* For READ_IMPLIES_EXEC */ + #ifndef CONFIG_MMU #include -- cgit v1.2.3 From 834fac54d12562b594d676623c0e11627ef558cc Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:35 +1000 Subject: dax: add tracepoints to dax_iomap_pte_fault() Patch series "second round of tracepoints for DAX". This second round of DAX tracepoint patches adds tracing to the PTE fault path (dax_iomap_pte_fault(), dax_pfn_mkwrite(), dax_load_hole(), dax_insert_mapping()) and to the writeback path (dax_writeback_mapping_range(), dax_writeback_one()). The purpose of this tracing is to give us a high level view of what DAX is doing, whether faults are being serviced by PMDs or PTEs, and by real storage or by zero pages covering holes. I do have some patches nearly ready which also add tracing to grab_mapping_entry() and dax_insert_mapping_entry(). These are more targeted at logging how we are interacting with the radix tree, how we use empty entries for locking, whether we "downgrade" huge zero pages to 4k PTE sized allocations, etc. In the end it seemed to me that this might be too detailed to have as constantly present tracepoints, but if anyone sees value in having tracepoints like this in the DAX code permanently (Jan?), please let me know and I'll add those last two patches. All these tracepoints were done to be consistent with the style of the XFS tracepoints and with the existing DAX PMD tracepoints. This patch (of 6): Add tracepoints to dax_iomap_pte_fault(), following the same logging conventions as the rest of DAX. Here is an example fault that initially tries to be serviced by the PMD fault handler but which falls back to PTEs because the VMA isn't large enough to hold a PMD: small-1086 [005] .... 71.140014: xfs_filemap_huge_fault: dev 259:0 ino 0x1003 small-1086 [005] .... 71.140027: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 vm_start 0x10200000 vm_end 0x10500000 pgoff 0x220 max_pgoff 0x1400 small-1086 [005] .... 71.140028: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 vm_start 0x10200000 vm_end 0x10500000 pgoff 0x220 max_pgoff 0x1400 FALLBACK small-1086 [005] .... 71.140035: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 small-1086 [005] .... 
71.140396: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 MAJOR|NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 15 +++++++++++---- include/trace/events/fs_dax.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index de622d4282a6..13acc2655428 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1133,13 +1133,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1150,8 +1153,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); + if (error) { + vmf_ret = dax_fault_return(error); + goto out; + } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ goto finish_iomap; @@ -1235,6 +1240,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } +out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c566ddc87f73..cbcd7d64a18d 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -150,6 +150,47 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); +DECLARE_EVENT_CLASS(dax_pte_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), + TP_ARGS(inode, vmf, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? 
"shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PTE_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pte_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \ + TP_ARGS(inode, vmf, result)) + +DEFINE_PTE_FAULT_EVENT(dax_pte_fault); +DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From d2cdb9662231f4edffdab1f4740cf5ef4f0f7a1a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:37 +1000 Subject: dax: add tracepoints to dax_pfn_mkwrite() Add tracepoints to dax_pfn_mkwrite(), following the same logging conventions as the rest of DAX. Here is an example PTE fault followed by a pfn_mkwrite: small_aligned-1094 [002] .... 374.084998: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 small_aligned-1094 [002] .... 374.085145: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 MAJOR|NOPAGE small_aligned-1094 [002] .... 374.085165: dax_pfn_mkwrite: dev 259:0 ino 0x1003 shared WRITE|MKWRITE|ALLOW_RETRY|KILLABLE|USER address 0x10400000 pgoff 0x200 NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-3-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 3 +++ include/trace/events/fs_dax.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 13acc2655428..ed9af976d8a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -932,6 +932,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -941,6 +942,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -953,6 +955,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index cbcd7d64a18d..b5a520961f8d 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,6 +190,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); #endif /* _TRACE_FS_DAX_H */ -- cgit v1.2.3 From 64a157cb721ab13e64e85460fb56b3351347a515 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:38 +1000 Subject: dax: add tracepoints to dax_load_hole() Add tracepoints to dax_load_hole(), following the same logging conventions as the rest of DAX. Here is the logging generated by a PTE read from a hole: read-1075 [002] .... 
62.362108: dax_pte_fault: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 read-1075 [002] .... 62.362140: dax_load_hole: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 NOPAGE read-1075 [002] .... 62.362141: dax_pte_fault_done: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10480000 pgoff 0x280 NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 16 +++++++++++----- include/trace/events/fs_dax.h | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index ed9af976d8a6..c46a5a666a3e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -546,21 +546,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -568,8 +572,10 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index b5a520961f8d..2f15dfea7fb1 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -192,6 +192,7 @@ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); +DEFINE_PTE_FAULT_EVENT(dax_load_hole); #endif /* _TRACE_FS_DAX_H */ -- cgit v1.2.3 From 107f537c68f9cd9d7ab585963c4a5df71196e960 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:40 +1000 Subject: dax: add tracepoints to dax_writeback_mapping_range() Add tracepoints to dax_writeback_mapping_range(), following the same logging conventions as the rest of DAX. Here is an example writeback call: msync-1085 [006] .... 200.902565: dax_writeback_range: dev 259:0 ino 0x1003 pgoff 0x200-0x2ff msync-1085 [006] .... 
200.902579: dax_writeback_range_done: dev 259:0 ino 0x1003 pgoff 0x200-0x2ff Link: http://lkml.kernel.org/r/20170221195116.13278-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 8 ++++++-- include/trace/events/fs_dax.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c46a5a666a3e..9b1c7ed4dddc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -879,6 +879,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -899,10 +901,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, mapping, indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + trace_dax_writeback_range_done(inode, start_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 2f15dfea7fb1..9afe8c8f0bef 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -194,6 +194,38 @@ DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +DECLARE_EVENT_CLASS(dax_writeback_range_class, + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), + TP_ARGS(inode, start_index, end_index), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, start_index) + __field(pgoff_t, end_index) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_index = start_index; + __entry->end_index = end_index; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->start_index, + __entry->end_index + ) +) + +#define DEFINE_WRITEBACK_RANGE_EVENT(name) \ +DEFINE_EVENT(dax_writeback_range_class, name, \ + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\ + TP_ARGS(inode, start_index, end_index)) + +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 05afbdd973135e9d18bfc70110e583ddc90d0a45 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:41 +1000 Subject: dax: fix regression in dax_writeback_mapping_range() commit 354ae7432ee8 ("dax: add tracepoints to dax_writeback_mapping_range()") in the -next tree, which appears in next-20170310, inadvertently changed dax_writeback_mapping_range() so that it could end up returning a positive value: the number of bytes flushed, as returned by dax_writeback_one(). This was incorrect. This function either needs to return a negative error value, or zero on success. This change was causing xfstest failures, as reported by Xiong: https://lkml.org/lkml/2017/3/13/1220 With this fix applied to next-20170310, all the test failures reported by Xiong (generic/075 generic/112 generic/127 generic/231 generic/263) are resolved. 
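To illustrate why a positive return is harmful, a hypothetical caller sketch (the signature is assumed from this era of the DAX code); callers treat any non-zero return as failure:

    int ret;

    ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
    if (ret)
            return ret;     /* expected to be 0 on success or a negative errno */

A positive "entries flushed" count leaking out of dax_writeback_mapping_range() would therefore be reported as an error to the writeback machinery.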
Link: http://lkml.kernel.org/r/20170314215358.31451-1-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reported-by: Xiong Zhou Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 9b1c7ed4dddc..23a55b485be2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -906,7 +906,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, } out: trace_dax_writeback_range_done(inode, start_index, end_index); - return ret; + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -- cgit v1.2.3 From ff91661855f2a95a77f3cea5e5e56daabd3759ea Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:42 +1000 Subject: dax: add tracepoint to dax_writeback_one() Add a tracepoint to dax_writeback_one(), following the same logging conventions as the rest of DAX. Here is an example range writeback which ends up flushing one PMD and one PTE: test-1265 [003] .... 496.615250: dax_writeback_range: dev 259:0 ino 0x1003 pgoff 0x0-0x7ffffffffffff test-1265 [003] .... 496.616263: dax_writeback_one: dev 259:0 ino 0x1003 pgoff 0x0 pglen 0x200 test-1265 [003] .... 496.616270: dax_writeback_one: dev 259:0 ino 0x1003 pgoff 0x305 pglen 0x1 test-1265 [003] .... 496.616272: dax_writeback_range_done: dev 259:0 ino 0x1003 pgoff 0x0-0x7ffffffffffff Link: http://lkml.kernel.org/r/20170221195116.13278-6-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 1 + include/trace/events/fs_dax.h | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 23a55b485be2..5a38d40457e8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -844,6 +844,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); + trace_dax_writeback_one(mapping->host, index, dax.size >> PAGE_SHIFT); unmap: dax_unmap_atomic(bdev, &dax); put_locked_mapping_entry(mapping, index, entry); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 9afe8c8f0bef..292a4719edd0 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -226,6 +226,30 @@ DEFINE_EVENT(dax_writeback_range_class, name, \ DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); +TRACE_EVENT(dax_writeback_one, + TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), + TP_ARGS(inode, pgoff, pglen), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, pgoff) + __field(pgoff_t, pglen) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = pgoff; + __entry->pglen = pglen; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->pglen + ) +) + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c345b97bdbb460e10e259f0be94cea951d84b268 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 5 Apr 2017 09:19:44 +1000 Subject: dax: add tracepoint to dax_insert_mapping() Add a tracepoint to dax_insert_mapping(), following the same logging conventions as 
the rest of DAX. This tracepoint, along with the one in dax_load_hole(), lets us know how a DAX PTE fault was serviced. Here is an example DAX fault that inserts a PTE mapping: small-1126 [007] .... 145.451604: dax_pte_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 small-1126 [007] .... 145.452317: dax_insert_mapping: dev 259:0 ino 0x1003 shared write address 0x10420000 radix_entry 0x100006 small-1126 [007] .... 145.452399: dax_pte_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10420000 pgoff 0x220 MAJOR|NOPAGE Link: http://lkml.kernel.org/r/20170221195116.13278-7-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dan Williams Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- fs/dax.c | 1 + include/trace/events/fs_dax.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 5a38d40457e8..cd29ad7b6ae9 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -932,6 +932,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(ret); *entryp = ret; + trace_dax_insert_mapping(mapping->host, vmf, ret); return vm_insert_mixed(vma, vaddr, dax.pfn); } diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 292a4719edd0..08bb3ed18dcc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -194,6 +194,36 @@ DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); DEFINE_PTE_FAULT_EVENT(dax_load_hole); +TRACE_EVENT(dax_insert_mapping, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), + TP_ARGS(inode, vmf, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + (unsigned long)__entry->radix_entry + ) +) + DECLARE_EVENT_CLASS(dax_writeback_range_class, TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), TP_ARGS(inode, start_index, end_index), -- cgit v1.2.3 From bcd025be9f1051a729c1a457399e7bd81064e3bb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 09:19:48 +1000 Subject: kbuild: consolidate header generation from ASM offset information Largely redundant code is used in different places to generate C headers from offset information extracted from assembly language output. Consolidate the code in a Makefile include and use this instead. 
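As a reminder of what this machinery consumes, a minimal asm-offsets.c-style producer looks roughly like this (the constant name and struct member are illustrative; DEFINE() comes from <linux/kbuild.h>):

    #include <linux/kbuild.h>
    #include <linux/stddef.h>
    #include <linux/sched.h>

    int main(void)
    {
            DEFINE(TASK_STACK_OFFSET, offsetof(struct task_struct, stack));
            return 0;
    }

Compiling this with -S leaves "->TASK_STACK_OFFSET <value> offsetof(struct task_struct, stack)" markers in the assembly output, which the consolidated sed expression rewrites into "#define TASK_STACK_OFFSET <value> /* offsetof(struct task_struct, stack) */" lines in the generated header.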
Link: http://lkml.kernel.org/r/20170403193739.84905-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: Tony Luck Cc: Fenghua Yu Cc: Michal Marek Cc: Jan Kiszka Cc: Kieran Bingham Cc: Grant Grundler Cc: Michael Davidson Cc: Greg Hackmann Cc: Masahiro Yamada Signed-off-by: Andrew Morton --- Kbuild | 21 ++------------------- arch/ia64/kernel/Makefile | 19 +++---------------- scripts/Makefile.asm-offsets | 22 ++++++++++++++++++++++ scripts/mod/Makefile | 21 ++------------------- 4 files changed, 29 insertions(+), 54 deletions(-) create mode 100644 scripts/Makefile.asm-offsets diff --git a/Kbuild b/Kbuild index 3d0ae152af7c..e3789c9611fd 100644 --- a/Kbuild +++ b/Kbuild @@ -7,29 +7,12 @@ # 4) Check for missing system calls # 5) Generate constants.py (may need bounds.h) -# Default sed regexp - multiline due to syntax constraints -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include scripts/Makefile.asm-offsets # Use filechk to avoid rebuilds when a header changes, but the resulting file # does not define filechk_offsets - (set -e; \ - echo "#ifndef $2"; \ - echo "#define $2"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y); \ - echo ""; \ - echo "#endif" ) + $(call gen_header_from_asm_offsets,$2) endef ##### diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 3686d6abafde..186ba553ff26 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -50,25 +50,12 @@ CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 # The gate DSO image is built using a special linker script. include $(src)/Makefile.gate +include $(srctree)/scripts/Makefile.asm-offsets + # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) 
based on config -define sed-y - "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" -endef quiet_cmd_nr_irqs = GEN $@ define cmd_nr_irqs - (set -e; \ - echo "#ifndef __ASM_NR_IRQS_H__"; \ - echo "#define __ASM_NR_IRQS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__ASM_NR_IRQS_H__) < $< > $@ endef # We use internal kbuild rules to avoid the "is up to date" message from make diff --git a/scripts/Makefile.asm-offsets b/scripts/Makefile.asm-offsets new file mode 100644 index 000000000000..4ba80ba29b82 --- /dev/null +++ b/scripts/Makefile.asm-offsets @@ -0,0 +1,22 @@ +# Default sed regexp - multiline due to syntax constraints +define sed-asm-offsets-to-c + "/^->/{s:->#\(.*\):/* \1 */:; \ + s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ + s:->::; p;}" +endef + +define gen_header_from_asm_offsets + (set -e; \ + echo "#ifndef $1"; \ + echo "#define $1"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-asm-offsets-to-c); \ + echo ""; \ + echo "#endif" ) +endef diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 19d9bcadc0cc..5858bebfaf32 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -7,28 +7,11 @@ modpost-objs := modpost.o file2alias.o sumversion.o devicetable-offsets-file := devicetable-offsets.h -define sed-y - "/^->/{s:->#\(.*\):/* \1 */:; \ - s:^->\([^ ]*\) [\$$#]*\([-0-9]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" -endef +include $(srctree)/scripts/Makefile.asm-offsets quiet_cmd_offsets = GEN $@ define cmd_offsets - (set -e; \ - echo "#ifndef __DEVICETABLE_OFFSETS_H__"; \ - echo "#define __DEVICETABLE_OFFSETS_H__"; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by Kbuild"; \ - echo " *"; \ - echo " */"; \ - echo ""; \ - sed -ne $(sed-y) $<; \ - echo ""; \ - echo "#endif" ) > $@ + $(call gen_header_from_asm_offsets,__DEVICETABLE_OFFSETS_H__) < $< > $@ endef $(obj)/$(devicetable-offsets-file): $(obj)/devicetable-offsets.s -- cgit v1.2.3 From 197ec848ee088be8030c0c96a961114b7f676127 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Apr 2017 09:19:52 +1000 Subject: fs/ocfs2/cluster: use setup_timer Use setup_timer() instead of init_timer() to simplify the code. 
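The change follows the usual pattern; with placeholder names (the timer API of this era, where the callback takes an unsigned long cookie), the conversion looks like:

    struct timer_list timer;

    /* open-coded initialization */
    init_timer(&timer);
    timer.function = my_timer_fn;           /* void my_timer_fn(unsigned long data) */
    timer.data = (unsigned long)my_obj;

    /* equivalent single call */
    setup_timer(&timer, my_timer_fn, (unsigned long)my_obj);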
Link: http://lkml.kernel.org/r/5e75bf07beb91e092d5aa36c36769949a480456a.1489060564.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b4..13014f83042c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); -- cgit v1.2.3 From 033881af1a0b7d79f7082a28558d9c1d0c20d765 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 5 Apr 2017 09:19:54 +1000 Subject: ocfs2: o2hb: revert hb threshold to keep compatible Configfs is the interface for ocfs2-tools to set configure to kernel and $configfs_dir/cluster/$clustername/heartbeat/dead_threshold is the one used to configure heartbeat dead threshold. Kernel has a default value of it but user can set O2CB_HEARTBEAT_THRESHOLD in /etc/sysconfig/o2cb to override it. Commit 45b997737a80 ("ocfs2/cluster: use per-attribute show and store methods") changed heartbeat dead threshold name while ocfs2-tools did not, so ocfs2-tools won't set this configurable and the default value is always used. So revert it. Fixes: 45b997737a80 ("ocfs2/cluster: use per-attribute show and store methods") Link: http://lkml.kernel.org/r/1490665245-15374-1-git-send-email-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f6e871760f8d..0da0332725aa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; -- cgit v1.2.3 From 2702777758dd0840f2246acedc8ecaa271710e36 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 5 Apr 2017 09:19:55 +1000 Subject: ocfs2: old mle put and release after the function dlm_add_migration_mle called If the old mle is found after the dlm_add_migration_mle called, it should be put once. 
If the return value is not - EEXIST and its type is BLOCK, it should be put again to release it to avoid memory leak, for it had been unhashed from the map. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4A3D4B7FE@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..4438671c4ac3 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2612,20 +2612,45 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + if (ret == -EEXIST) { + if(oldmle) + __dlm_put_mle(oldmle); + + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + mlog(0, "another process is already migrating it\n"); + goto fail; + } + + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. + */ + if (oldmle) { + /* master is known, detach if not already detached */ + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + + /* if the type of the mle is BLOCK, should put it once for release. + * otherwise memory leak may be caused because oldmle had been unhashed + * from the hash map, it will not be found anymore. + */ + if (oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + /* get an extra reference on the mle. * otherwise the assert_master from the new * master will destroy this. */ dlm_get_mle_inuse(mle); + mle_added = 1; + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - if (ret == -EEXIST) { - mlog(0, "another process is already migrating it\n"); - goto fail; - } - mle_added = 1; - /* * set the MIGRATING flag and flush asts * if we fail after this we need to re-dirty the lockres @@ -2642,12 +2667,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (ret != -EEXIST && oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (ret < 0) { if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); @@ -3182,16 +3201,23 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); + /* If an old one mle found, it should be put. if its type is BLOCK, + * it should be put again. Because it had been unhasded from the map + * in the function dlm_add_migration_mle. + * otherwise the memory will be leaked. It will not found it again from + * the hash map. 
+ */ + if (oldmle) { + __dlm_mle_detach_hb_events(dlm, oldmle); + __dlm_put_mle(oldmle); + if (ret >= 0 && oldmle->type == DLM_MLE_BLOCK) + __dlm_put_mle(oldmle); + } + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); - if (oldmle) { - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, oldmle); - dlm_put_mle(oldmle); - } - if (res) dlm_lockres_put(res); leave: -- cgit v1.2.3 From 40f5224a0525caa6c4e4962eea92e24bbe9e5349 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:19:57 +1000 Subject: ocfs2-old-mle-put-and-release-after-the-function-dlm_add_migration_mle-called-fix fix coding style, comments Cc: Guozhonghua Cc: Joel Becker Cc: Joseph Qi Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4438671c4ac3..f0072145eead 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2613,7 +2613,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); if (ret == -EEXIST) { - if(oldmle) + if (oldmle) __dlm_put_mle(oldmle); spin_unlock(&dlm->master_lock); @@ -2622,10 +2622,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, goto fail; } - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again. Because it has been unhasded from the map * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { @@ -2633,9 +2634,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, __dlm_mle_detach_hb_events(dlm, oldmle); __dlm_put_mle(oldmle); - /* if the type of the mle is BLOCK, should put it once for release. - * otherwise memory leak may be caused because oldmle had been unhashed - * from the hash map, it will not be found anymore. + /* + * If the type of the mle is BLOCK, it should be put once for + * release. Otherwise a memory leak may be caused because + * oldmle has been unhashed from the hash map and it will not + * be found any more. */ if (oldmle->type == DLM_MLE_BLOCK) __dlm_put_mle(oldmle); @@ -3201,10 +3204,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, if (ret < 0) kmem_cache_free(dlm_mle_cache, mle); - /* If an old one mle found, it should be put. if its type is BLOCK, - * it should be put again. Because it had been unhasded from the map - * in the function dlm_add_migration_mle. - * otherwise the memory will be leaked. It will not found it again from + /* + * If an old mle is found, it should be put. If its type is BLOCK, + * it should be put again because it has been unhashed from the map + * in the dlm_add_migration_mle(). + * Otherwise the memory will be leaked. It will not be found again from * the hash map. */ if (oldmle) { -- cgit v1.2.3 From 4819d61ef0476abdfcfd770a4d846cc4d0624e52 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 5 Apr 2017 09:19:58 +1000 Subject: ocfs2/dlm: optimize freeing of dead node locks Three loops can be optimized into one and its sub loops, so less code can do the same work. 
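The consolidated loop walks the granted, converting and blocked queues by index; dlm_list_idx_to_ptr() maps that index back to the per-resource list head, roughly like this (paraphrased sketch, not the verbatim dlmcommon.h definition):

    static inline struct list_head *
    dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
    {
            struct list_head *ret = NULL;

            if (idx == DLM_GRANTED_LIST)
                    ret = &res->granted;
            else if (idx == DLM_CONVERTING_LIST)
                    ret = &res->converting;
            else if (idx == DLM_BLOCKED_LIST)
                    ret = &res->blocked;
            BUG_ON(!ret);
            return ret;
    }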
Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4C4AF898E@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..4c4b18e612c5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,6 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2280,31 +2282,18 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - list_for_each_entry_safe(lock, next, &res->granted, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->converting, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; - } - } - list_for_each_entry_safe(lock, next, &res->blocked, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */ - dlm_lock_put(lock); - freed++; + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * do manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 4816877e9bc5e27f13dfa19829f9e36a9da6298a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:00 +1000 Subject: ocfs2-dlm-optimization-of-code-while-free-dead-node-locks-checkpatch-fixes WARNING: please, no spaces at the start of a line #26: FILE: fs/ocfs2/dlm/dlmrecovery.c:2271: + struct list_head *queue = NULL;$ WARNING: please, no spaces at the start of a line #27: FILE: fs/ocfs2/dlm/dlmrecovery.c:2272: + int i;$ WARNING: please, no spaces at the start of a line #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {$ WARNING: suspect code indent for conditional statements (7, 15) #60: FILE: fs/ocfs2/dlm/dlmrecovery.c:2285: + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); ERROR: code indent should use tabs where possible #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ WARNING: please, no spaces at the start of a line #61: FILE: fs/ocfs2/dlm/dlmrecovery.c:2286: + queue = dlm_list_idx_to_ptr(res, i);$ ERROR: code indent should use tabs where possible #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: please, no spaces at the start of a line #62: FILE: fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) {$ WARNING: suspect code indent for conditional statements (15, 23) #62: FILE: 
fs/ocfs2/dlm/dlmrecovery.c:2287: + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { ERROR: code indent should use tabs where possible #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: please, no spaces at the start of a line #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) {$ WARNING: suspect code indent for conditional statements (23, 31) #63: FILE: fs/ocfs2/dlm/dlmrecovery.c:2288: + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); ERROR: code indent should use tabs where possible #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ WARNING: please, no spaces at the start of a line #64: FILE: fs/ocfs2/dlm/dlmrecovery.c:2289: + list_del_init(&lock->list);$ ERROR: code indent should use tabs where possible #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #65: FILE: fs/ocfs2/dlm/dlmrecovery.c:2290: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #66: FILE: fs/ocfs2/dlm/dlmrecovery.c:2291: + /* Can't schedule DLM_UNLOCK_FREE_LOCK$ ERROR: code indent should use tabs where possible #67: FILE: fs/ocfs2/dlm/dlmrecovery.c:2292: + * do manually$ ERROR: code indent should use tabs where possible #68: FILE: fs/ocfs2/dlm/dlmrecovery.c:2293: + */$ ERROR: code indent should use tabs where possible #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ WARNING: please, no spaces at the start of a line #69: FILE: fs/ocfs2/dlm/dlmrecovery.c:2294: + dlm_lock_put(lock);$ ERROR: code indent should use tabs where possible #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ WARNING: please, no spaces at the start of a line #70: FILE: fs/ocfs2/dlm/dlmrecovery.c:2295: + freed++;$ ERROR: code indent should use tabs where possible #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ WARNING: please, no spaces at the start of a line #71: FILE: fs/ocfs2/dlm/dlmrecovery.c:2296: + }$ total: 11 errors, 14 warnings, 51 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. NOTE: Whitespace errors detected. You may wish to use scripts/cleanpatch or scripts/cleanfile ./patches/ocfs2-dlm-optimization-of-code-while-free-dead-node-locks.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Cc: Guozhonghua Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmrecovery.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 4c4b18e612c5..908b05942282 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2268,8 +2268,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, { struct dlm_lock *lock, *next; unsigned int freed = 0; - struct list_head *queue = NULL; - int i; + struct list_head *queue = NULL; + int i; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2282,18 +2282,19 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */ /* TODO: check pending_asts, pending_basts here */ - for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { - queue = dlm_list_idx_to_ptr(res, i); - list_for_each_entry_safe(lock, next, queue, list) { - if (lock->ml.node == dead_node) { - list_del_init(&lock->list); - dlm_lock_put(lock); - /* Can't schedule DLM_UNLOCK_FREE_LOCK - * do manually - */ - dlm_lock_put(lock); - freed++; - } + for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_entry_safe(lock, next, queue, list) { + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + /* + * Can't schedule DLM_UNLOCK_FREE_LOCK: do + * manually + */ + dlm_lock_put(lock); + freed++; + } } } -- cgit v1.2.3 From 44ecd79dcf536a3335dea8f4e20a0460aabb3651 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Wed, 5 Apr 2017 09:20:02 +1000 Subject: block: restore /proc/partitions to not display non-partitionable removable devices We found with newer kernels we started seeing the cdrom device showing up in /proc/partitions, but it was not there before. Looking into this I found that commit d27769ec ("block: add GENHD_FL_NO_PART_SCAN") introduces this change in behavior. It's not clear to me from the commit's changelog if this change was intentional or not. This comment still remains: /* Don't show non-partitionable removeable devices or empty devices */ so I've decided to send a patch to restore the behavior of not printing unpartitionable removable devices. Signed-off-by: Josh Hunt Cc: Tejun Heo Cc: Kay Sievers Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a9c516a8b37d..08c1cc707702 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -889,7 +889,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) -- cgit v1.2.3 From 21447a80830ea99460fbde74cc026ea80fe13fb0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:04 +1000 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes Patch series "mm: kswapd spinning on unreclaimable nodes - fixes and cleanups". Jia reported a scenario in which the kswapd of a node indefinitely spins at 100% CPU usage. We have seen similar cases at Facebook. 
The kernel's current method of judging its ability to reclaim a node (or whether to back off and sleep) is based on the amount of scanned pages in proportion to the amount of reclaimable pages. In Jia's and our scenarios, there are no reclaimable pages in the node, however, and the condition for backing off is never met. Kswapd busyloops in an attempt to restore the watermarks while having nothing to work with. This series reworks the definition of an unreclaimable node based not on scanning but on whether kswapd is able to actually reclaim pages in MAX_RECLAIM_RETRIES (16) consecutive runs. This is the same criteria the page allocator uses for giving up on direct reclaim and invoking the OOM killer. If it cannot free any pages, kswapd will go to sleep and leave further attempts to direct reclaim invocations, which will either make progress and re-enable kswapd, or invoke the OOM killer. Patch #1 fixes the immediate problem Jia reported, the remainder are smaller fixlets, cleanups, and overall phasing out of the old method. Patch #6 is the odd one out. It's a nice cleanup to get_scan_count(), and directly related to #5, but in itself not relevant to the series. If the whole series is too ambitious for 4.11, I would consider the first three patches fixes, the rest cleanups. This patch (of 9): Jia He reports a problem with kswapd spinning at 100% CPU when requesting more hugepages than memory available in the system: $ echo 4000 >/proc/sys/vm/nr_hugepages top - 13:42:59 up 3:37, 1 user, load average: 1.09, 1.03, 1.01 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.0 us, 12.5 sy, 0.0 ni, 85.5 id, 2.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 31371520 total, 30915136 used, 456384 free, 320 buffers KiB Swap: 6284224 total, 115712 used, 6168512 free. 48192 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 76 root 20 0 0 0 0 R 100.0 0.000 217:17.29 kswapd3 At that time, there are no reclaimable pages left in the node, but as kswapd fails to restore the high watermarks it refuses to go to sleep. Kswapd needs to back away from nodes that fail to balance. Up until 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") kswapd had such a mechanism. It considered zones whose theoretically reclaimable pages it had reclaimed six times over as unreclaimable and backed away from them. This guard was erroneously removed as the patch changed the definition of a balanced node. However, simply restoring this code wouldn't help in the case reported here: there *are* no reclaimable pages that could be scanned until the threshold is met. Kswapd would stay awake anyway. Introduce a new and much simpler way of backing off. If kswapd runs through MAX_RECLAIM_RETRIES (16) cycles without reclaiming a single page, make it back off from the node. This is the same number of shots direct reclaim takes before declaring OOM. Kswapd will go to sleep on that node until a direct reclaimer manages to reclaim some pages, thus proving the node reclaimable again. 
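In outline, the new back-off behaves like the following toy userspace model (a sketch only, not kernel code: the struct and helpers are simplified stand-ins, though the constant and field name mirror the patch):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16	/* same limit direct reclaim uses before OOM */

struct node {
	int kswapd_failures;
};

/* One reclaim run against a node: no progress counts as a failure. */
static void reclaim_run(struct node *n, unsigned long nr_reclaimed)
{
	if (nr_reclaimed)
		n->kswapd_failures = 0;	/* any progress resets the counter */
	else
		n->kswapd_failures++;
}

/* Wakeup/sleep decision: a hopeless node is left to direct reclaim. */
static bool kswapd_may_run(const struct node *n)
{
	return n->kswapd_failures < MAX_RECLAIM_RETRIES;
}

int main(void)
{
	struct node n = { 0 };

	while (kswapd_may_run(&n))
		reclaim_run(&n, 0);	/* nothing reclaimable on this node */
	printf("kswapd backs off after %d failed runs\n", n.kswapd_failures);

	reclaim_run(&n, 32);		/* direct reclaim made progress */
	printf("kswapd may run again: %s\n", kswapd_may_run(&n) ? "yes" : "no");
	return 0;
}

In the real patch the pieces live in balance_pgdat() (counting failed runs), shrink_node() (resetting on progress), and prepare_kswapd_sleep()/wakeup_kswapd() (backing off), as the diff below shows.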
v2: move MAX_RECLAIM_RETRIES to mm/internal.h (Michal) Link: http://lkml.kernel.org/r/20170228214007.5621-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Jia He Tested-by: Jia He Acked-by: Michal Hocko Acked-by: Hillf Danton Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ mm/internal.h | 6 ++++++ mm/page_alloc.c | 9 ++------- mm/vmscan.c | 27 ++++++++++++++++++++------- mm/vmstat.c | 2 +- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..d2c50ab6ae40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -630,6 +630,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/mm/internal.h b/mm/internal.h index ccfc2a2969f4..aae93e3fd984 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,6 +80,12 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d6a665057d61..8a40aedddfe5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3517,12 +3517,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. @@ -4529,7 +4523,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..71292ef80ed4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. 
@@ -3128,6 +3133,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3310,6 +3319,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3509,6 +3521,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3779,9 +3795,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 89f95396ec46..16d0411fc2cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3 From b956edd67315a4381af1fba5af7f5f33f5bbd1fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:05 +1000 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes fix Check kswapd failure against the cumulative nr_reclaimed count, not against the count from the lowest priority iteration. Link: http://lkml.kernel.org/r/20170306162410.GB2090@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Minchan Kim Acked-by: Minchan Kim Cc: Michal Hocko Cc: Jia He Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 71292ef80ed4..ea90340ef933 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3223,9 +3223,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3315,7 +3315,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); -- cgit v1.2.3 From fa6f62b7f13e9671f3a1299db003001bb0cb2e06 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 5 Apr 2017 09:20:06 +1000 Subject: mm: fix condition for throttle_direct_reclaim Since "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes" kswapd has been modified to give up after MAX_RECLAIM_RETRIES number of unsucessful iterations. 
Before going to sleep, kswapd thread will unconditionally wakeup all threads sleeping on pfmemalloc_wait. However the awoken threads will recheck the watermarks and wake the kswapd thread and sleep again on pfmemalloc_wait. There is a chance that the system might end up in livelock between unsuccessful kswapd and direct reclaimers because all direct reclaimer might end up in throttle_direct_reclaim and there is nobody to make a forward progress. So, add kswapd_failures check on the throttle_direct_reclaim condition. Link: http://lkml.kernel.org/r/20170314183228.20152-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Michal Hocko Suggested-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmscan.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ea90340ef933..667644e53b5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2822,7 +2822,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2830,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone) || @@ -2910,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2932,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3119,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3304,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ -- cgit v1.2.3 From c779bcc2064754fbb791181c5b3c5c723e2287ce Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:07 +1000 Subject: mm: fix check for reclaimable pages in PF_MEMALLOC reclaim throttling PF_MEMALLOC direct reclaimers get throttled on a node when the sum of all free pages in each zone fall below half the min watermark. During the summation, we want to exclude zones that don't have reclaimables. 
Checking the same pgdat over and over again doesn't make sense. Fixes: 599d0c954f91 ("mm, vmscan: move LRU lists to node") Link: http://lkml.kernel.org/r/20170228214007.5621-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 667644e53b5c..52832bedb2ed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2835,8 +2835,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); -- cgit v1.2.3 From 281e5c092e1e7336f0d44ef9baa25f9fb2593ee6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:08 +1000 Subject: mm: remove seemingly spurious reclaimability check from laptop_mode gating 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") allowed laptop_mode=1 to start writing not just when the priority drops to DEF_PRIORITY - 2 but also when the node is unreclaimable. That appears to be a spurious change in this patch as I doubt the series was tested with laptop_mode, and neither is that particular change mentioned in the changelog. Remove it, it's still recent. Link: http://lkml.kernel.org/r/20170228214007.5621-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Michal Hocko Cc: Jia He Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 52832bedb2ed..014d0d181be0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3285,7 +3285,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ -- cgit v1.2.3 From f9a37e599cf573f5da678439ecadbe2de3455a37 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:08 +1000 Subject: mm: remove unnecessary reclaimability check from NUMA balancing target NUMA balancing already checks the watermarks of the target node to decide whether it's a suitable balancing target. Whether the node is reclaimable or not is irrelevant when we don't intend to reclaim. 
Link: http://lkml.kernel.org/r/20170228214007.5621-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/migrate.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ed97c2c14fa8..011df454812b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, { int z; - if (!pgdat_reclaimable(pgdat)) - return false; - for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; -- cgit v1.2.3 From d7c5d077f9c4ae68137b15f8c4dfcb8946e93928 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:09 +1000 Subject: mm: don't avoid high-priority reclaim on unreclaimable nodes 246e87a93934 ("memcg: fix get_scan_count() for small targets") sought to avoid high reclaim priorities for kswapd by forcing it to scan a minimum amount of pages when lru_pages >> priority yielded nothing. b95a2f2d486d ("mm: vmscan: convert global reclaim to per-memcg LRU lists"), due to switching global reclaim to a round-robin scheme over all cgroups, had to restrict this forceful behavior to unreclaimable zones in order to prevent massive overreclaim with many cgroups. The latter patch effectively neutered the behavior completely for all but extreme memory pressure. But in those situations we might as well drop the reclaimers to lower priority levels. Remove the check. Link: http://lkml.kernel.org/r/20170228214007.5621-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 014d0d181be0..2fd50ca88016 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2130,22 +2130,13 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, int pass; /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When + * If the zone or memcg is small, nr[l] can be 0. When * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. + * latencies, so it's better to scan a minimum amount. When a + * cgroup has already been deleted, scrape out the remaining + * cache forcefully to get rid of the lingering state. */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) + if (!global_reclaim(sc) || !mem_cgroup_online(memcg)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ -- cgit v1.2.3 From eba74071a6806d36f26a6c4d1771448f97fc9604 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:10 +1000 Subject: mm: don't avoid high-priority reclaim on memcg limit reclaim 246e87a93934 ("memcg: fix get_scan_count() for small targets") sought to avoid high reclaim priorities for memcg by forcing it to scan a minimum amount of pages when lru_pages >> priority yielded nothing. This was done at a time when reclaim decisions like dirty throttling were tied to the priority level. 
Nowadays, the only meaningful thing still tied to priority dropping below DEF_PRIORITY - 2 is gating whether laptop_mode=1 is generally allowed to write. But that is from an era where direct reclaim was still allowed to call ->writepage, and kswapd nowadays avoids writes until it's scanned every clean page in the system. Potential changes to how quick sc->may_writepage could trigger are of little concern. Remove the force_scan stuff, as well as the ugly multi-pass target calculation that it necessitated. Link: http://lkml.kernel.org/r/20170228214007.5621-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 94 ++++++++++++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 57 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2fd50ca88016..9117ae8d49ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2123,21 +2123,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount. When a - * cgroup has already been deleted, scrape out the remaining - * cache forcefully to get rid of the lingering state. - */ - if (!global_reclaim(sc) || !mem_cgroup_online(memcg)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2268,55 +2255,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. - */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. 
*/ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } -- cgit v1.2.3 From f66b0818d4b2b9ccd141582acb63002578a90427 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:10 +1000 Subject: mm: delete NR_PAGES_SCANNED and pgdat_reclaimable() NR_PAGES_SCANNED counts number of pages scanned since the last page free event in the allocator. This was used primarily to measure the reclaimability of zones and nodes, and determine when reclaim should give up on them. In that role, it has been replaced in the preceding patches by a different mechanism. Being implemented as an efficient vmstat counter, it was automatically exported to userspace as well. It's however unlikely that anyone outside the kernel is using this counter in any meaningful way. Remove the counter and the unused pgdat_reclaimable(). Link: http://lkml.kernel.org/r/20170228214007.5621-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - mm/internal.h | 1 - mm/page_alloc.c | 15 +++------------ mm/vmscan.c | 9 --------- mm/vmstat.c | 22 +++------------------- 5 files changed, 6 insertions(+), 42 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d2c50ab6ae40..04e0969966f6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,7 +149,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, diff --git a/mm/internal.h b/mm/internal.h index aae93e3fd984..c583ce1b32b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -91,7 +91,6 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a40aedddfe5..5f2fd0bd5ead 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,15 +1090,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned, flags; + unsigned long flags; bool isolated_pageblocks; spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - while (count) { struct page *page; struct list_head *list; @@ -1150,13 +1146,10 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned, flags; + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, 
pfn); @@ -4499,7 +4492,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4522,7 +4514,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9117ae8d49ee..02f2eb51b33e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -230,12 +230,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -1750,7 +1744,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1953,8 +1946,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); diff --git a/mm/vmstat.c b/mm/vmstat.c index 16d0411fc2cc..9911ec58fce2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -1378,7 +1377,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,7 +1384,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1587,22 +1584,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. - */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) -- cgit v1.2.3 From c72c741ba1c05e83497dcb417c48c0ac0ffdaf9b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:11 +1000 Subject: Revert "mm, vmscan: account for skipped pages as a partial scan" This reverts commit d7f05528eedb047efe2288cff777676b028747b6. 
Now that reclaimability of a node is no longer based on the ratio between pages scanned and theoretically reclaimable pages, we can remove accounting tricks for pages skipped due to zone constraints. Link: http://lkml.kernel.org/r/20170228214007.5621-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmscan.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 02f2eb51b33e..77832f0dbe0d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1472,12 +1472,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; + unsigned long skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + !list_empty(src); scan++) { struct page *page; page = lru_to_page(src); @@ -1491,12 +1491,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } - /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. - */ - scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1525,6 +1519,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1532,17 +1527,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); -- cgit v1.2.3 From 08b755e44f1f17bc3aba76c8b54e020d4291787e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:11 +1000 Subject: mm: remove unnecessary back-off function when retrying page reclaim The backoff mechanism is not needed. If we have MAX_RECLAIM_RETRIES loops without progress, we'll OOM anyway; backing off might cut one or two iterations off that in the rare OOM case. If we have intermittent success reclaiming a few pages, the backoff function gets reset also, and so is of little help in these scenarios. We might want a backoff function for when there IS progress, but not enough to be satisfactory. But this isn't that. Remove it. 
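For a concrete feel of what is being deleted, here is a standalone arithmetic sketch (not kernel code; the page count is invented) of how the removed DIV_ROUND_UP scaling discounted the reclaimable estimate as no_progress_loops grew:

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long reclaimable = 10000;	/* hypothetical LRU page count */
	unsigned long loops;

	for (loops = 0; loops <= MAX_RECLAIM_RETRIES; loops += 4) {
		unsigned long available = reclaimable -
			DIV_ROUND_UP(loops * reclaimable, MAX_RECLAIM_RETRIES);
		printf("no_progress_loops=%2lu -> considers %5lu of %lu pages\n",
		       loops, available, reclaimable);
	}
	return 0;
}

The estimate only collapses to zero right at MAX_RECLAIM_RETRIES, so removing the scaling delays the give-up point by at most an iteration or two, as noted above.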
Link: http://lkml.kernel.org/r/20170228214007.5621-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f2fd0bd5ead..1f222dd91a5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3513,11 +3513,10 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3562,13 +3561,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); -- cgit v1.2.3 From 932489293c74c6ee03c0ee82fdf81a949c957e33 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Apr 2017 09:20:12 +1000 Subject: mm/page-writeback.c: use setup_deferrable_timer Use setup_deferrable_timer() instead of init_timer_deferrable() to simplify the code. Link: http://lkml.kernel.org/r/e8e3d4280a34facbc007346f31df833cec28801e.1488070291.git.geliangtang@gmail.com Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton --- mm/page-writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7fb9e7..33df0583edb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - init_timer_deferrable(&dom->period_timer); - dom->period_timer.function = writeout_period; - dom->period_timer.data = (unsigned long)dom; + setup_deferrable_timer(&dom->period_timer, writeout_period, + (unsigned long)dom); dom->dirty_limit_tstamp = jiffies; -- cgit v1.2.3 From e7b090350311749b3e5ad07954f8649be9ebedff Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:13 +1000 Subject: mm: delete unnecessary TTU_* flags Patch series "mm: fix some MADV_FREE issues", v5. We are trying to use MADV_FREE in jemalloc. Several issues are found. Without solving the issues, jemalloc can't use the MADV_FREE feature. - Doesn't support system without swap enabled. Because if swap is off, we can't or can't efficiently age anonymous pages. And since MADV_FREE pages are mixed with other anonymous pages, we can't reclaim MADV_FREE pages. In current implementation, MADV_FREE will fallback to MADV_DONTNEED without swap enabled. 
But in our environment, a lot of machines don't enable swap, which prevents our setup from using MADV_FREE.

- Increases memory pressure. Page reclaim is biased towards reclaiming file pages over anonymous pages. That doesn't make sense for MADV_FREE pages, because those pages can be freed easily and refilled with very slight penalty. Even if page reclaim didn't prefer file pages there would still be an issue, because MADV_FREE pages and other anonymous pages are mixed together: to reclaim a MADV_FREE page we probably have to scan a lot of other anonymous pages, which is inefficient. In our tests we usually see OOM with MADV_FREE enabled and nothing without it.

- Accounting. There are two accounting problems. We don't have global accounting, so if the system misbehaves we don't know whether the problem comes from the MADV_FREE side. The other problem is RSS accounting: MADV_FREE pages are accounted as normal anon pages and reclaimed lazily, so the application's RSS grows. This confuses our workloads. We have a monitoring daemon running, and if it finds an application's RSS has become abnormal it kills the application, even though the kernel could reclaim the memory easily.

To address the first two issues we can either put MADV_FREE pages into a separate LRU list (Minchan's previous patches and the V1 patches), or put them into the LRU_INACTIVE_FILE list (suggested by Johannes). The patchset uses the second idea. The reason is that the LRU_INACTIVE_FILE list is tiny nowadays and should be full of used-once file pages, so we can still reclaim MADV_FREE pages efficiently there without interfering with other anon and active file pages. Putting the pages on the inactive file list also lets page reclaim prioritize MADV_FREE pages alongside used-once file pages. MADV_FREE pages are put on that LRU list with their SwapBacked flag cleared, so PageAnon(page) && !PageSwapBacked(page) indicates a MADV_FREE page. Such pages are freed directly, without pageout, if they are clean; otherwise normal swap reclaims them.

For the third issue, the previous post added global accounting and a separate RSS count for MADV_FREE pages. The problem is that we never get accurate accounting for MADV_FREE pages: they are mapped into userspace and can be dirtied without the kernel noticing. To get accurate accounting we could write-protect the pages, but that adds page-fault overhead which people don't want to pay. The jemalloc developers have concerns about the inaccurate accounting, so this post drops the accounting patches temporarily. The information exported to /proc/pid/smaps for MADV_FREE pages is kept, which is the only place we can get accurate accounting right now.

This patch (of 6):

Johannes pointed out that TTU_LZFREE is unnecessary. It's true because we always have the flag set if we want to do an unmap; for the cases where we don't unmap, the TTU_LZFREE part of the code should never run. TTU_UNMAP is unnecessary as well: if no other flag (for example, TTU_MIGRATION) is set, an unmap is implied.
The patch includes Johannes's cleanup and dead TTU_ACTION macro removal code Link: http://lkml.kernel.org/r/4be3ea1bc56b26fd98a54d0a6f70bec63f6d8980.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 +++++++++------------- mm/memory-failure.c | 2 +- mm/rmap.c | 2 +- mm/vmscan.c | 11 ++++------- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e..7a3941492856 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,8 +191,6 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - int try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..f85adfe57484 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -907,7 +907,7 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; diff --git a/mm/rmap.c b/mm/rmap.c index f6838015810f..d7b6d780764b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1426,7 +1426,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (!PageDirty(page) && (flags & TTU_LZFREE)) { + if (!PageDirty(page)) { /* It's a freeable page by MADV_FREE */ dec_mm_counter(mm, MM_ANONPAGES); rp->lazyfreed++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 77832f0dbe0d..e5c00f2b98ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -966,7 +966,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; int ret = SWAP_SUCCESS; cond_resched(); @@ -1120,7 +1119,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, 
page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1138,9 +1136,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? - (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { + switch (ret = try_to_unmap(page, + ttu_flags | TTU_BATCH_FLUSH)) { case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; @@ -1348,7 +1345,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1740,7 +1737,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From 216032ff36d2c5ebff90bd2419abc7779beb2540 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:14 +1000 Subject: mm: don't assume anonymous pages have SwapBacked flag There are a few places the code assumes anonymous pages should have SwapBacked flag set. MADV_FREE pages are anonymous pages but we are going to add them to LRU_INACTIVE_FILE list and clear SwapBacked flag for them. The assumption doesn't hold any more, so fix them. Link: http://lkml.kernel.org/r/3945232c0df3dd6c4ef001976f35a95f18dcb407.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 - mm/khugepaged.c | 8 +++----- mm/migrate.c | 3 ++- mm/rmap.c | 3 ++- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ebc93e179f3..f7dd8b886d41 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2368,7 +2368,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4..88e4b1737c90 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); 
VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); diff --git a/mm/migrate.c b/mm/migrate.c index 011df454812b..937378e8b883 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1944,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (PageSwapBacked(page)) + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index d7b6d780764b..b4084d09dbe8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1424,7 +1424,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), + page); if (!PageDirty(page)) { /* It's a freeable page by MADV_FREE */ -- cgit v1.2.3 From b8f0976d39d334991ac75e0f6067884f29055821 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:15 +1000 Subject: mm: move MADV_FREE pages into LRU_INACTIVE_FILE list madv()'s MADV_FREE indicate pages are 'lazyfree'. They are still anonymous pages, but they can be freed without pageout. To distinguish these from normal anonymous pages, we clear their SwapBacked flag. MADV_FREE pages could be freed without pageout, so they pretty much like used once file pages. For such pages, we'd like to reclaim them once there is memory pressure. Also it might be unfair reclaiming MADV_FREE pages always before used once file pages and we definitively want to reclaim the pages before other anonymous and file pages. To speed up MADV_FREE pages reclaim, we put the pages into LRU_INACTIVE_FILE list. The rationale is LRU_INACTIVE_FILE list is tiny nowadays and should be full of used once file pages. Reclaiming MADV_FREE pages will not have much interfere of anonymous and active file pages. And the inactive file pages and MADV_FREE pages will be reclaimed according to their age, so we don't reclaim too many MADV_FREE pages too. Putting the MADV_FREE pages into LRU_INACTIVE_FILE_LIST also means we can reclaim the pages without swap support. This idea is suggested by Johannes. This patch doesn't move MADV_FREE pages to LRU_INACTIVE_FILE list yet to avoid bisect failure, next patch will do it. The patch is based on Minchan's original patch. 
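For reference, none of this changes the user-visible API: an application simply tags anonymous ranges it is done with via madvise(2). A minimal, hypothetical caller (not part of this series; MADV_FREE needs Linux 4.5+ and a libc that defines the flag) looks like this:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;		/* 64 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xab, len);		/* touch it so pages are actually allocated */

	/*
	 * The allocator is done with the range for now.  The kernel may
	 * reclaim these pages lazily under pressure; writing to the range
	 * again later is legal and reuses surviving pages or gets fresh ones.
	 */
#ifdef MADV_FREE
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");
#endif
	return 0;
}

With this series such ranges sit on the inactive file LRU, so they can be dropped under pressure even on the swapless machines described in the series introduction.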
Link: http://lkml.kernel.org/r/2f87063c1e9354677b7618c647abde77b07561e5.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- include/linux/vm_event_item.h | 2 +- mm/huge_memory.c | 3 --- mm/madvise.c | 2 -- mm/swap.c | 50 ++++++++++++++++++++++++------------------- mm/vmstat.c | 1 + 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d..486494e6b2fc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a80b7b59cf33..d84ae90ccd5c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7dd8b886d41..ccd301360ad1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, tlb->fullmm); diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..cf3021b05b32 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,8 +411,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } } diff --git a/mm/swap.c b/mm/swap.c index c4910f14f957..c4fb4b9f7524 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -561,20 +561,26 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + 
del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * lazyfree pages are clean anonymous pages. They have + * SwapBacked flag cleared to distinguish normal anonymous + * pages + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -604,9 +610,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -638,22 +644,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. */ -void deactivate_page(struct page *page) -{ - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); +void mark_page_lazyfree(struct page *page) + { + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -704,7 +710,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, lru_add_drain_wq, work); diff --git a/mm/vmstat.c b/mm/vmstat.c index 9911ec58fce2..35de61b46408 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -991,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", -- cgit v1.2.3 From 312f6b0a70120436adbb122fa494e546a574ed90 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:16 +1000 Subject: mm-move-madv_free-pages-into-lru_inactive_file-list-checkpatch-fixes WARNING: line over 80 characters #127: FILE: mm/swap.c:571: + del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); WARNING: please, no spaces at the start of a line #177: FILE: mm/swap.c:654: + {$ total: 0 errors, 2 warnings, 133 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/mm-move-madv_free-pages-into-lru_inactive_file-list.patch has style problems, please review. 
NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index c4fb4b9f7524..ac98eb443a03 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -568,7 +568,8 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, !PageUnevictable(page)) { bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); /* @@ -651,7 +652,7 @@ void deactivate_file_page(struct page *page) * This is done to accelerate the reclaim of @page. */ void mark_page_lazyfree(struct page *page) - { +{ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); -- cgit v1.2.3 From c3084daebe1c7dd6be2f4f437732a98993aea947 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:17 +1000 Subject: mm: reclaim MADV_FREE pages When memory pressure is high, we free MADV_FREE pages. If the pages are not dirty in pte, the pages could be freed immediately. Otherwise we can't reclaim them. We put the pages back to anonumous LRU list (by setting SwapBacked flag) and the pages will be reclaimed in normal swapout way. We use normal page reclaim policy. Since MADV_FREE pages are put into inactive file list, such pages and inactive file pages are reclaimed according to their age. This is expected, because we don't want to reclaim too many MADV_FREE pages before used once pages. Based on Minchan's original patch Link: http://lkml.kernel.org/r/14b8eb1d3f6bf6cc492833f183ac8c304e560484.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 ++ mm/madvise.c | 1 + mm/rmap.c | 40 +++++++++++++++++----------------------- mm/vmscan.c | 34 ++++++++++++++++++++++------------ 5 files changed, 43 insertions(+), 36 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7a3941492856..fee10d744ebd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 +#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ccd301360ad1..c474e6f0461a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1573,6 +1573,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); diff --git a/mm/madvise.c b/mm/madvise.c index cf3021b05b32..d3a6712c3e14 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -413,6 +413,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { diff --git a/mm/rmap.c b/mm/rmap.c index b4084d09dbe8..b7c47fd59108 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1288,11 +1288,6 @@ void 
page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1308,8 +1303,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1427,11 +1421,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), page); - if (!PageDirty(page)) { + /* + * swapin page could be clean, it has data stored in + * swap. We can't silently discard it without setting + * swap entry in the page table. + */ + if (!PageDirty(page) && !PageSwapCache(page)) { /* It's a freeable page by MADV_FREE */ dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; goto discard; + } else if (!PageSwapBacked(page)) { + /* dirty MADV_FREE page */ + set_pte_at(mm, address, pvmw.pte, pteval); + ret = SWAP_DIRTY; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { @@ -1499,18 +1503,15 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. + * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1531,11 +1532,8 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } return ret; } @@ -1562,14 +1560,10 @@ static int page_not_mapped(struct page *page) int try_to_munlock(struct page *page) { int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, diff --git a/mm/vmscan.c b/mm/vmscan.c index e5c00f2b98ab..ec4555369e17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -906,7 +906,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -987,7 +988,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1113,8 +1115,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? 
* Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) @@ -1135,9 +1139,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { + if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + case SWAP_DIRTY: + SetPageSwapBacked(page); + /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; @@ -1145,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1258,10 +1263,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1271,9 +1284,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* -- cgit v1.2.3 From 751632341c07df3cb18fc388e4d87f261a768713 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:18 +1000 Subject: mm: clean up lazyfree page handling We can make it simple to understand without need to be aware of clean-swapin page. This patch just clean up lazyfree page handling in try_to_unmap_one. Link: http://lkml.kernel.org/r/20170303025237.GB3503@bbox Signed-off-by: Minchan Kim Reviewed-by: Shaohua Li Cc: Michal Hocko Cc: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/rmap.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index b7c47fd59108..519b7eb723d1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1421,17 +1421,17 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), page); - /* - * swapin page could be clean, it has data stored in - * swap. We can't silently discard it without setting - * swap entry in the page table. - */ - if (!PageDirty(page) && !PageSwapCache(page)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - goto discard; - } else if (!PageSwapBacked(page)) { - /* dirty MADV_FREE page */ + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } + + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. 
+ */ set_pte_at(mm, address, pvmw.pte, pteval); ret = SWAP_DIRTY; page_vma_mapped_walk_done(&pvmw); -- cgit v1.2.3 From 6f42a3110d537567eb0ea3ff1668c59fe7013598 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:19 +1000 Subject: mm: fix lazyfree BUG_ON check in try_to_unmap_one() If a page is swapbacked, it means it should be in swapcache in try_to_unmap_one's path. If a page is !swapbacked, it means it shouldn't be in swapcache in try_to_unmap_one's path. Check both cases at once and, if the check fails, warn and return SWAP_FAIL. Such a bug never means we should shut down the kernel. Link: http://lkml.kernel.org/r/20170307055551.GC29458@bbox Signed-off-by: Minchan Kim Suggested-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Shaohua Li Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/rmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 519b7eb723d1..1b6105be87f3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1418,8 +1418,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), - page); + if (VM_WARN_ON_ONCE(PageSwapBacked(page) != + PageSwapCache(page))) { + ret = SWAP_FAIL; + page_vma_mapped_walk_done(&pvmw); + break; + + } /* MADV_FREE page check */ if (!PageSwapBacked(page)) { -- cgit v1.2.3 From 50b06ec48564f28f673f48d852fa188fcf2c92b7 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:20 +1000 Subject: mm: do not use VM_WARN_ON_ONCE as if condition Sergey reported that VM_WARN_ON_ONCE() returns void with !CONFIG_DEBUG_VM, so unlike WARN_ON() it cannot be used as an if condition. This patch fixes it. Link: http://lkml.kernel.org/r/20170309060226.GB854@bbox Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/rmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1b6105be87f3..a19bd8b8ab0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1418,12 +1418,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - if (VM_WARN_ON_ONCE(PageSwapBacked(page) != - PageSwapCache(page))) { + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; - } /* MADV_FREE page check */ -- cgit v1.2.3 From e92e4622d3c09f7f7fb5778c90e7cb92b4b2412c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:21 +1000 Subject: mm: enable MADV_FREE for swapless system Now MADV_FREE pages can be easily reclaimed even on a swapless system. We can safely enable MADV_FREE for all systems.
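
For reference, the userspace side of the contract looks roughly like this (a hypothetical sketch, not part of the patch; it assumes libc headers that define MADV_FREE, and the program itself is made up):

#include <sys/mman.h>
#include <string.h>

int main(void)
{
	size_t len = 1 << 20;
	char *buf;

	/* MADV_FREE only applies to private anonymous mappings. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);			/* dirty the pages */

	/*
	 * Mark the contents disposable: the mapping stays valid, but under
	 * memory pressure the kernel may discard the pages instead of
	 * swapping them out.  A later write to a page cancels its lazy free.
	 */
	madvise(buf, len, MADV_FREE);

	buf[0] = 1;	/* this page is dirtied again and will not be dropped */

	munmap(buf, len);
	return 0;
}
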
Link: http://lkml.kernel.org/r/155648585589300bfae1d45078e7aebb3d988b87.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/madvise.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index d3a6712c3e14..a09d2d3dfae9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -650,13 +650,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - /* - * XXX: In this implementation, MADV_FREE works like - * MADV_DONTNEED on swapless system or full swap. - */ - if (get_nr_swap_pages() > 0) - return madvise_free(vma, prev, start, end); - /* passthrough */ + return madvise_free(vma, prev, start, end); case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: -- cgit v1.2.3 From 69c0412e8ff4ee456f36e2bb2a82358c81fe9dce Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 5 Apr 2017 09:20:22 +1000 Subject: proc: show MADV_FREE pages info in smaps Show MADV_FREE pages info of each vma in smaps. The interface is for diganose or monitoring purpose, userspace could use it to understand what happens in the application. Since userspace could dirty MADV_FREE pages without notice from kernel, this interface is the only place we can get accurate accounting info about MADV_FREE pages. Link: http://lkml.kernel.org/r/89efde633559de1ec07444f2ef0f4963a97a2ce8.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/task_mmu.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index c94b4675d021..45853e116eef 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -412,6 +412,7 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB @@ -441,6 +442,9 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. +"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). +The memory isn't freed immediately with madvise(). It's freed in memory +pressure if the memory is clean. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..b0e3800b4cf5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 
1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, -- cgit v1.2.3 From afd080b551855b2440925117f9019becadbe8b06 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:22 +1000 Subject: proc-show-madv_free-pages-info-in-smaps-fix update Documentation/filesystems/proc.txt Cc: Shaohua Li Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 45853e116eef..0b58b317dc76 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -444,7 +444,9 @@ a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. "LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). The memory isn't freed immediately with madvise(). It's freed in memory -pressure if the memory is clean. +pressure if the memory is clean. Please note that the printed value might +be lower than the real value due to optimizations used in the current +implementation. If this is not desirable please file a bug report. "AnonHugePages" shows the ammount of memory backed by transparent hugepage. "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. -- cgit v1.2.3 From 47eecd1eaf803dec48f8d96a83b99bc0dc72662d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:24 +1000 Subject: mm: memcontrol: provide shmem statistics Cgroups currently don't report how much shmem they use, which can be useful data to have, in particular since shmem is included in the cache/file item while being reclaimed like anonymous memory. Add a counter to track shmem pages during charging and uncharging. Link: http://lkml.kernel.org/r/20170221164343.32252-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Chris Down Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- Documentation/cgroup-v2.txt | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 28 ++++++++++++++++++++-------- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1e..e50b95c25868 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. 
Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb7250c45cb8..c5ebb32fef49 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -46,6 +46,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..490d5b4676c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", + "shmem", "mapped_file", "dirty", "writeback", @@ -608,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - else + else { __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], + nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -5208,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", @@ -5476,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5495,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); @@ -5507,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + 
pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** -- cgit v1.2.3 From f58a7e847cde1b28baa9fc2ca611a993bc28ae5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:25 +1000 Subject: thp: reduce indentation level in change_huge_pmd() Patch series "thp: fix few MADV_DONTNEED races" For MADV_DONTNEED to work properly with huge pages, it's critical to not clear pmd intermittently unless you hold down_write(mmap_sem). Otherwise MADV_DONTNEED can miss the THP which can lead to userspace breakage. See example of such race in commit message of patch 2/4. All these races are found by code inspection. I haven't seen them triggered. I don't think it's worth to apply them to stable@. This patch (of 4): Restructure code in preparation for a fix. Link: http://lkml.kernel.org/r/20170302151034.27829-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c474e6f0461a..55badf9baf0d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1723,37 +1723,37 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - int ret = 0; + pmd_t entry; + bool preserve_write; + int ret; ptl = __pmd_trans_huge_lock(pmd, vma); - if (ptl) { - pmd_t entry; - bool preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; + if (!ptl) + return 0; - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) { - spin_unlock(ptl); - return ret; - } + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; - if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && - pmd_write(entry)); - } - spin_unlock(ptl); - } + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) + goto unlock; + if (prot_numa && pmd_protnone(*pmd)) + goto unlock; + + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); return ret; } -- cgit v1.2.3 From 4eee541efec1e9965c4b302d7ad70b7cacfcf6a8 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 5 Apr 2017 09:20:26 +1000 Subject: thp: fix MADV_DONTNEED vs. numa balancing race In case prot_numa, we are under down_read(mmap_sem). It's critical to not clear pmd intermittently to avoid race with MADV_DONTNEED which is also under down_read(mmap_sem): CPU0: CPU1: change_huge_pmd(prot_numa=1) pmdp_huge_get_and_clear_notify() madvise_dontneed() zap_pmd_range() pmd_trans_huge(*pmd) == 0 (without ptl) // skip the pmd set_pmd_at(); // pmd is re-established The race makes MADV_DONTNEED miss the huge pmd and don't clear it which may break userspace. Found by code analysis, never saw triggered. Link: http://lkml.kernel.org/r/20170302151034.27829-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55badf9baf0d..23768acddf2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1745,7 +1745,39 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa && pmd_protnone(*pmd)) goto unlock; - entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); + /* + * In case prot_numa, we are under down_read(mmap_sem). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under down_read(mmap_sem): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and don't clear it + * which may break userspace. + * + * pmdp_invalidate() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + entry = *pmd; + pmdp_invalidate(vma, addr, pmd); + + /* + * Recover dirty/young flags. It relies on pmdp_invalidate to not + * corrupt them. + */ + if (pmd_dirty(*pmd)) + entry = pmd_mkdirty(entry); + if (pmd_young(*pmd)) + entry = pmd_mkyoung(entry); + entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); -- cgit v1.2.3 From 2318e4706aeffde78a521196312880f331081b01 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:26 +1000 Subject: mm: drop unused pmdp_huge_get_and_clear_notify() Dave noticed that after fixing MADV_DONTNEED vs. numa balancing race the last pmdp_huge_get_and_clear_notify() user is gone. Let's drop the helper. Link: http://lkml.kernel.org/r/20170306112047.24809-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. 
* This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From a99ee3fe57fd1c5f23c20e291d1e2d56c5d766d7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:27 +1000 Subject: thp: fix MADV_DONTNEED vs. MADV_FREE race Both MADV_DONTNEED and MADV_FREE handled with down_read(mmap_sem). It's critical to not clear pmd intermittently while handling MADV_FREE to avoid race with MADV_DONTNEED: CPU0: CPU1: madvise_free_huge_pmd() pmdp_huge_get_and_clear_full() madvise_dontneed() zap_pmd_range() pmd_trans_huge(*pmd) == 0 (without ptl) // skip the pmd set_pmd_at(); // pmd is re-established It results in MADV_DONTNEED skipping the pmd, leaving it not cleared. It violates MADV_DONTNEED interface and can result is userspace misbehaviour. Basically it's the same race as with numa balancing in change_huge_pmd(), but a bit simpler to mitigate: we don't need to preserve dirty/young flags here due to MADV_FREE functionality. Link: http://lkml.kernel.org/r/20170302151034.27829-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23768acddf2f..8a239d965304 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1565,8 +1565,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, unlock_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); -- cgit v1.2.3 From 8fa0fc1aaa3004fbee769176f5de5b442313c618 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:28 +1000 Subject: thp-fix-madv_dontneed-vs-madv_free-race-fix Urgh... Power is special again. Link: http://lkml.kernel.org/r/20170303102636.bhd2zhtpds4mt62a@black.fi.intel.com Signed-off-by: Kirill A. Shutemov Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a239d965304..c36094c4fa4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1565,6 +1565,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, unlock_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); -- cgit v1.2.3 From 15b240ffe2b8d2f51dbf65c841ecb398b629c9bb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 09:20:28 +1000 Subject: thp: fix MADV_DONTNEED vs clear soft dirty race Yet another instance of the same race. Fix is identical to change_huge_pmd(). See "thp: fix MADV_DONTNEED vs. numa balancing race" for more details. Link: http://lkml.kernel.org/r/20170302151034.27829-5-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0e3800b4cf5..f0c8b33d99b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -906,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); -- cgit v1.2.3 From 475c08708b9287294943379efd6e56ab67037bff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:20:29 +1000 Subject: mm, swap: Fix a race in free_swap_and_cache() Before using cluster lock in free_swap_and_cache(), the swap_info_struct->lock will be held during freeing the swap entry and acquiring page lock, so the page swap count will not change when testing page information later. But after using cluster lock, the cluster lock (or swap_info_struct->lock) will be held only during freeing the swap entry. So before acquiring the page lock, the page swap count may be changed in another thread. If the page swap count is not 0, we should not delete the page from the swap cache. This is fixed via checking page swap count again after acquiring the page lock. I found the race when I review the code, so I didn't trigger the race via a test program. If the race occurs for an anonymous page shared by multiple processes via fork, multiple pages will be allocated and swapped in from the swap device for the previously shared one page. That is, the user-visible runtime effect is more memory will be used and the access latency for the page will be higher, that is, the performance regression. Link: http://lkml.kernel.org/r/20170301143905.12846-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b90..6b6bb1bb6209 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1111,6 +1111,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently swapped out? 
* This does not give an exact answer when swap count is continued, @@ -1119,17 +1131,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1297,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } -- cgit v1.2.3 From 8fff6f299969c2e298b12ef62a77d2f24c788c92 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 5 Apr 2017 09:20:29 +1000 Subject: mm: use is_migrate_highatomic() to simplify the code Introduce two helpers, is_migrate_highatomic() and is_migrate_highatomic_page(). Simplify the code, no functional changes. Link: http://lkml.kernel.org/r/58B94F15.6060606@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ mm/page_alloc.c | 14 ++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 04e0969966f6..4b25dd64a0be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -66,6 +66,11 @@ enum { /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern char * const migratetype_names[MIGRATE_TYPES]; +#define is_migrate_highatomic(migratetype) \ + (migratetype == MIGRATE_HIGHATOMIC) +#define is_migrate_highatomic_page(_page) \ + (get_pageblock_migratetype(_page) == MIGRATE_HIGHATOMIC) + #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f222dd91a5b..3b4b7d5017a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2035,8 +2035,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); @@ -2093,8 +2093,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. 
*/ - if (get_pageblock_migratetype(page) == - MIGRATE_HIGHATOMIC) { + if (is_migrate_highatomic_page(page)) { /* * It should never happen but changes to * locking could inadvertently allow a per-cpu @@ -2151,8 +2150,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) + if (can_steal && !is_migrate_highatomic_page(page)) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2489,7 +2487,7 @@ void free_hot_cold_page(struct page *page, bool cold) /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2600,7 +2598,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } -- cgit v1.2.3 From 55da3de08375fb99fc115dfb9d604364ea6044aa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:30 +1000 Subject: mm-use-is_migrate_highatomic-to-simplify-the-code-fix use static inlines rather than macros, per mhocko Cc: Joonsoo Kim Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 +------ mm/internal.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b25dd64a0be..446cf68c1c09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, @@ -66,11 +66,6 @@ enum { /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern char * const migratetype_names[MIGRATE_TYPES]; -#define is_migrate_highatomic(migratetype) \ - (migratetype == MIGRATE_HIGHATOMIC) -#define is_migrate_highatomic_page(_page) \ - (get_pageblock_migratetype(_page) == MIGRATE_HIGHATOMIC) - #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) diff --git a/mm/internal.h b/mm/internal.h index c583ce1b32b9..823a7a89099b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -503,4 +503,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From dcf15639c61d23445556546fb6074cd16d4b84c6 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 5 Apr 2017 09:20:31 +1000 Subject: mm: use is_migrate_isolate_page() to simplify the code Use 
is_migrate_isolate_page() to simplify the code, no functional changes. Link: http://lkml.kernel.org/r/58B94FB1.8020802@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_isolation.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926a..7927bbb54a4e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); -- cgit v1.2.3 From 8686ce143327cd173c852eb1fd7f24736e128e8b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 5 Apr 2017 09:20:31 +1000 Subject: mm, vmstat: print non-populated zones in zoneinfo Initscripts can use the information (protection levels) from /proc/zoneinfo to configure vm.lowmem_reserve_ratio at boot. vm.lowmem_reserve_ratio is an array of ratios for each configured zone on the system. If a zone is not populated on an arch, /proc/zoneinfo suppresses its output. This results in there not being a 1:1 mapping between the set of zones emitted by /proc/zoneinfo and the zones configured by vm.lowmem_reserve_ratio. This patch shows statistics for non-populated zones in /proc/zoneinfo. The zones exist and hold a spot in the vm.lowmem_reserve_ratio array. Without this patch, it is not possible to determine which index in the array controls which zone if one or more zones on the system are not populated. Remaining users of walk_zones_in_node() are unchanged. Files such as /proc/pagetypeinfo require certain zone data to be initialized properly for display, which is not done for unpopulated zones. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1703031451310.98023@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Anshuman Khandual Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmstat.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 35de61b46408..f8844236926f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. 
+ */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1430,12 +1434,15 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1839,7 +1846,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1891,7 +1898,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } -- cgit v1.2.3 From 64f33106acd67891159b0ced339f2fc345ae0871 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 5 Apr 2017 09:20:32 +1000 Subject: mm, vmstat: suppress pcp stats for unpopulated zones in zoneinfo After "mm, vmstat: print non-populated zones in zoneinfo", /proc/zoneinfo will show unpopulated zones. The per-cpu pageset statistics are not relevant for unpopulated zones and can be potentially lengthy, so supress them when they are not interesting. Also moves lowmem reserve protection information above pcp stats since it is relevant for all zones per vm.lowmem_reserve_ratio. 
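
To make the consumer side concrete (a hypothetical sketch, not part of the patch), a boot script or tool can now rely on every zone, populated or not, exposing its protection array in /proc/zoneinfo, which keeps the array indices aligned with vm.lowmem_reserve_ratio:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[512], zone[64] = "?";

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		/* Zone headers look like: "Node 0, zone   Normal" */
		if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
			continue;
		/* With this series, the protection line exists for every zone. */
		if (strstr(line, "protection:"))
			printf("%s: %s", zone, strstr(line, "protection:"));
	}
	fclose(f);
	return 0;
}
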
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1703061400500.46428@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmstat.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index f8844236926f..fe937e32a7a6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1393,18 +1393,24 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; -- cgit v1.2.3 From ea04bcf38515398d3fbd93f0a2b74e4b1b24d593 Mon Sep 17 00:00:00 2001 From: Sangwoo Park Date: Wed, 5 Apr 2017 09:20:32 +1000 Subject: zram: reduce load operation in page_same_filled In page_same_filled function, all elements in the page is compared with next index value. The current comparison routine compares the (i)th and (i+1)th values of the page. In this case, two load operaions occur for each comparison. But if we store first value of the page stores at 'val' variable and using it to compare with others, the load opearation is reduced. It reduce load operation per page by up to 64times. Link: http://lkml.kernel.org/r/1488428104-7257-1-git-send-email-sangwoo2.park@lge.com Signed-off-by: Sangwoo Park Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dceb5edd1e54..01944419b1f3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -177,15 +177,17 @@ static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; + unsigned long val; page = (unsigned long *)ptr; + val = page[0]; - for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { - if (page[pos] != page[pos + 1]) + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (val != page[pos]) return false; } - *element = page[pos]; + *element = val; return true; } -- cgit v1.2.3 From 6ddca4ad37510572bb45a2b3011818bcff31fc8c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 5 Apr 2017 09:20:33 +1000 Subject: lockdep: teach lockdep about memalloc_noio_save Patch series "scope GFP_NOFS api", v5. This patch (of 7): Commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") added the memalloc_noio_(save|restore) functions to enable people to modify the MM behavior by disabling I/O during memory allocation. This was further extended in Fixes: 934f3072c17c ("mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set"). memalloc_noio_* functions prevent allocation paths recursing back into the filesystem without explicitly changing the flags for every allocation site. 
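
A minimal sketch of that usage pattern (the function around it is made up; memalloc_noio_save()/memalloc_noio_restore() and PF_MEMALLOC_NOIO are the existing interfaces, found in linux/sched.h in kernels of this vintage):

#include <linux/sched.h>
#include <linux/slab.h>

static void *noio_scope_alloc(size_t size)
{
	unsigned int noio_flags;
	void *buf;

	/*
	 * Everything allocated in this scope behaves as GFP_NOIO: while the
	 * task has PF_MEMALLOC_NOIO set, the allocator strips __GFP_IO (and
	 * __GFP_FS), even for GFP_KERNEL allocations made deep inside
	 * helpers that cannot be told about the restriction.
	 */
	noio_flags = memalloc_noio_save();

	buf = kmalloc(size, GFP_KERNEL);	/* effectively GFP_NOIO here */

	memalloc_noio_restore(noio_flags);
	return buf;
}
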
However, lockdep hasn't been keeping up with the changes and it entirely misses handling the memalloc_noio adjustments. Instead, it is left to the callers of __lockdep_trace_alloc to call the function after they have shaven the respective GFP flags which can lead to false positives: [ 644.173373] ================================= [ 644.174012] [ INFO: inconsistent lock state ] [ 644.174012] 4.10.0-nbor #134 Not tainted [ 644.174012] --------------------------------- [ 644.174012] inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. [ 644.174012] fsstress/3365 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 644.174012] (&xfs_nondir_ilock_class){++++?.}, at: [] xfs_ilock+0x141/0x230 [ 644.174012] {IN-RECLAIM_FS-W} state was registered at: [ 644.174012] __lock_acquire+0x62a/0x17c0 [ 644.174012] lock_acquire+0xc5/0x220 [ 644.174012] down_write_nested+0x4f/0x90 [ 644.174012] xfs_ilock+0x141/0x230 [ 644.174012] xfs_reclaim_inode+0x12a/0x320 [ 644.174012] xfs_reclaim_inodes_ag+0x2c8/0x4e0 [ 644.174012] xfs_reclaim_inodes_nr+0x33/0x40 [ 644.174012] xfs_fs_free_cached_objects+0x19/0x20 [ 644.174012] super_cache_scan+0x191/0x1a0 [ 644.174012] shrink_slab+0x26f/0x5f0 [ 644.174012] shrink_node+0xf9/0x2f0 [ 644.174012] kswapd+0x356/0x920 [ 644.174012] kthread+0x10c/0x140 [ 644.174012] ret_from_fork+0x31/0x40 [ 644.174012] irq event stamp: 173777 [ 644.174012] hardirqs last enabled at (173777): [] __local_bh_enable_ip+0x70/0xc0 [ 644.174012] hardirqs last disabled at (173775): [] __local_bh_enable_ip+0x37/0xc0 [ 644.174012] softirqs last enabled at (173776): [] _xfs_buf_find+0x67a/0xb70 [ 644.174012] softirqs last disabled at (173774): [] _xfs_buf_find+0x5db/0xb70 [ 644.174012] [ 644.174012] other info that might help us debug this: [ 644.174012] Possible unsafe locking scenario: [ 644.174012] [ 644.174012] CPU0 [ 644.174012] ---- [ 644.174012] lock(&xfs_nondir_ilock_class); [ 644.174012] [ 644.174012] lock(&xfs_nondir_ilock_class); [ 644.174012] [ 644.174012] *** DEADLOCK *** [ 644.174012] [ 644.174012] 4 locks held by fsstress/3365: [ 644.174012] #0: (sb_writers#10){++++++}, at: [] mnt_want_write+0x24/0x50 [ 644.174012] #1: (&sb->s_type->i_mutex_key#12){++++++}, at: [] vfs_setxattr+0x6f/0xb0 [ 644.174012] #2: (sb_internal#2){++++++}, at: [] xfs_trans_alloc+0xfc/0x140 [ 644.174012] #3: (&xfs_nondir_ilock_class){++++?.}, at: [] xfs_ilock+0x141/0x230 [ 644.174012] [ 644.174012] stack backtrace: [ 644.174012] CPU: 0 PID: 3365 Comm: fsstress Not tainted 4.10.0-nbor #134 [ 644.174012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 [ 644.174012] Call Trace: [ 644.174012] dump_stack+0x85/0xc9 [ 644.174012] print_usage_bug.part.37+0x284/0x293 [ 644.174012] ? print_shortest_lock_dependencies+0x1b0/0x1b0 [ 644.174012] mark_lock+0x27e/0x660 [ 644.174012] mark_held_locks+0x66/0x90 [ 644.174012] lockdep_trace_alloc+0x6f/0xd0 [ 644.174012] kmem_cache_alloc_node_trace+0x3a/0x2c0 [ 644.174012] ? vm_map_ram+0x2a1/0x510 [ 644.174012] vm_map_ram+0x2a1/0x510 [ 644.174012] ? vm_map_ram+0x46/0x510 [ 644.174012] _xfs_buf_map_pages+0x77/0x140 [ 644.174012] xfs_buf_get_map+0x185/0x2a0 [ 644.174012] xfs_attr_rmtval_set+0x233/0x430 [ 644.174012] xfs_attr_leaf_addname+0x2d2/0x500 [ 644.174012] xfs_attr_set+0x214/0x420 [ 644.174012] xfs_xattr_set+0x59/0xb0 [ 644.174012] __vfs_setxattr+0x76/0xa0 [ 644.174012] __vfs_setxattr_noperm+0x5e/0xf0 [ 644.174012] vfs_setxattr+0xae/0xb0 [ 644.174012] ? __might_fault+0x43/0xa0 [ 644.174012] setxattr+0x15e/0x1a0 [ 644.174012] ? 
__lock_is_held+0x53/0x90 [ 644.174012] ? rcu_read_lock_sched_held+0x93/0xa0 [ 644.174012] ? rcu_sync_lockdep_assert+0x2f/0x60 [ 644.174012] ? __sb_start_write+0x130/0x1d0 [ 644.174012] ? mnt_want_write+0x24/0x50 [ 644.174012] path_setxattr+0x8f/0xc0 [ 644.174012] SyS_lsetxattr+0x11/0x20 [ 644.174012] entry_SYSCALL_64_fastpath+0x23/0xc6 Let's fix this by making lockdep explicitly do the shaving of respective GFP flags. Fixes: 934f3072c17c ("mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set") Link: http://lkml.kernel.org/r/20170306131408.9828-2-mhocko@kernel.org Signed-off-by: Nikolay Borisov Signed-off-by: Michal Hocko Acked-by: Peter Zijlstra (Intel) Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. Wong Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/locking/lockdep.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..73a2025b8bde 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -2863,6 +2864,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = memalloc_noio_flags(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -3861,7 +3864,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); } void lockdep_clear_current_reclaim_state(void) -- cgit v1.2.3 From 6c0556eb86b90e799ca6dc15b9b599a0dfd594df Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:34 +1000 Subject: lockdep: allow to disable reclaim lockup detection The current implementation of the reclaim lockup detection can lead to false positives and those even happen and usually lead to tweak the code to silence the lockdep by using GFP_NOFS even though the context can use __GFP_FS just fine. See http://lkml.kernel.org/r/20160512080321.GA18496@dastard as an example. ================================= [ INFO: inconsistent lock state ] 4.5.0-rc2+ #4 Tainted: G O --------------------------------- inconsistent {RECLAIM_FS-ON-R} -> {IN-RECLAIM_FS-W} usage. kswapd0/543 [HC0[0]:SC0[0]:HE1:SE1] takes: (&xfs_nondir_ilock_class){++++-+}, at: [] xfs_ilock+0x177/0x200 [xfs] {RECLAIM_FS-ON-R} state was registered at: [] mark_held_locks+0x79/0xa0 [] lockdep_trace_alloc+0xb3/0x100 [] kmem_cache_alloc+0x33/0x230 [] kmem_zone_alloc+0x81/0x120 [xfs] [] xfs_refcountbt_init_cursor+0x3e/0xa0 [xfs] [] __xfs_refcount_find_shared+0x75/0x580 [xfs] [] xfs_refcount_find_shared+0x84/0xb0 [xfs] [] xfs_getbmap+0x608/0x8c0 [xfs] [] xfs_vn_fiemap+0xab/0xc0 [xfs] [] do_vfs_ioctl+0x498/0x670 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x6f CPU0 ---- lock(&xfs_nondir_ilock_class); lock(&xfs_nondir_ilock_class); *** DEADLOCK *** 3 locks held by kswapd0/543: stack backtrace: CPU: 0 PID: 543 Comm: kswapd0 Tainted: G O 4.5.0-rc2+ #4 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 ffffffff82a34f10 ffff88003aa078d0 ffffffff813a14f9 ffff88003d8551c0 ffff88003aa07920 ffffffff8110ec65 0000000000000000 0000000000000001 ffff880000000001 000000000000000b 0000000000000008 ffff88003d855aa0 Call Trace: [] dump_stack+0x4b/0x72 [] print_usage_bug+0x215/0x240 [] mark_lock+0x1f5/0x660 [] ? 
print_shortest_lock_dependencies+0x1a0/0x1a0 [] __lock_acquire+0xa80/0x1e50 [] ? kmem_cache_alloc+0x15e/0x230 [] ? kmem_zone_alloc+0x81/0x120 [xfs] [] lock_acquire+0xd8/0x1e0 [] ? xfs_ilock+0x177/0x200 [xfs] [] ? xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] down_write_nested+0x5e/0xc0 [] ? xfs_ilock+0x177/0x200 [xfs] [] xfs_ilock+0x177/0x200 [xfs] [] xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] xfs_fs_evict_inode+0xdc/0x1e0 [xfs] [] evict+0xc5/0x190 [] dispose_list+0x39/0x60 [] prune_icache_sb+0x4b/0x60 [] super_cache_scan+0x14f/0x1a0 [] shrink_slab.part.63.constprop.79+0x1e9/0x4e0 [] shrink_zone+0x15e/0x170 [] kswapd+0x4f1/0xa80 [] ? zone_reclaim+0x230/0x230 [] kthread+0xf2/0x110 [] ? kthread_create_on_node+0x220/0x220 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x220/0x220 To quote Dave: " Ignoring whether reflink should be doing anything or not, that's a "xfs_refcountbt_init_cursor() gets called both outside and inside transactions" lockdep false positive case. The problem here is lockdep has seen this allocation from within a transaction, hence a GFP_NOFS allocation, and now it's seeing it in a GFP_KERNEL context. Also note that we have an active reference to this inode. So, because the reclaim annotations overload the interrupt level detections and it's seen the inode ilock been taken in reclaim ("interrupt") context, this triggers a reclaim context warning where it thinks it is unsafe to do this allocation in GFP_KERNEL context holding the inode ilock... " This sounds like a fundamental problem of the reclaim lock detection. It is really impossible to annotate such a special usecase IMHO unless the reclaim lockup detection is reworked completely. Until then it is much better to provide a way to add "I know what I am doing flag" and mark problematic places. This would prevent from abusing GFP_NOFS flag which has a runtime effect even on configurations which have lockdep disabled. Introduce __GFP_NOLOCKDEP flag which tells the lockdep gfp tracking to skip the current allocation request. While we are at it also make sure that the radix tree doesn't accidentaly override tags stored in the upper part of the gfp_mask. Link: http://lkml.kernel.org/r/20170306131408.9828-3-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. 
Wong Cc: Nikolay Borisov Signed-off-by: Andrew Morton --- include/linux/gfp.h | 10 +++++++++- kernel/locking/lockdep.c | 4 ++++ lib/radix-tree.c | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index db373b9d3223..978232a3b4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -40,6 +40,11 @@ struct vm_area_struct; #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x4000000u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -179,8 +184,11 @@ struct vm_area_struct; #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 25 +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 73a2025b8bde..a06159c984e4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2884,6 +2884,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497..898e87998417 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu) void __init radix_tree_init(void) { int ret; + + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, -- cgit v1.2.3 From dcccc9d42e58df92f7114acfc721331e65ef1fc3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:34 +1000 Subject: xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite some time ago. We would like to make this concept more generic and use it for other filesystems as well. Let's start by giving the flag a more generic name PF_MEMALLOC_NOFS which is in line with an exiting PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO contexts. Replace all PF_FSTRANS usage from the xfs code in the first step before we introduce a full API for it as xfs uses the flag directly anyway. This patch doesn't introduce any functional change. Link: http://lkml.kernel.org/r/20170306131408.9828-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Darrick J. 
Wong Reviewed-by: Brian Foster Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.c | 4 ++-- fs/xfs/kmem.h | 2 +- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/xfs_aops.c | 6 +++--- fs/xfs/xfs_trans.c | 12 ++++++------ include/linux/sched.h | 2 ++ 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 70a5b55e0870..d0ac1a065539 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -63,13 +63,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) noio_flag = memalloc_noio_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) memalloc_noio_restore(noio_flag); return ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f0fc84fcaac2..a6c8da40c70d 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c3decedc9455..3059a3ec7ecb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 61494295d92f..05eca126c688 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. 
*/ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..f5969c8274fc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -914,7 +914,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +944,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +998,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..4528f7c9789f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1224,6 +1224,8 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ +#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example -- cgit v1.2.3 From b3225bf62ceec294665c03296ffe7f122e3b8188 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:35 +1000 Subject: mm: introduce memalloc_nofs_{save,restore} API GFP_NOFS context is used for the following 5 reasons currently - to prevent from deadlocks when the lock held by the allocation context would be needed during the memory reclaim - to prevent from stack overflows during the reclaim because the allocation is performed from a deep context already - to prevent lockups when the allocation context depends on other reclaimers to make a forward progress indirectly - just in case because this would be safe from the fs POV - silence lockdep false positives Unfortunately overuse of this allocation context brings some problems to the MM. Memory reclaim is much weaker (especially during heavy FS metadata workloads), OOM killer cannot be invoked because the MM layer doesn't have enough information about how much memory is freeable by the FS layer. 
In many cases it is far from clear why the weaker context is even used and so it might be used unnecessarily. We would like to get rid of those as much as possible. One way to do that is to use the flag in scopes rather than isolated cases. Such a scope is declared when really necessary, tracked per task and all the allocation requests from within the context will simply inherit the GFP_NOFS semantic. Not only this is easier to understand and maintain because there are much less problematic contexts than specific allocation requests, this also helps code paths where FS layer interacts with other layers (e.g. crypto, security modules, MM etc...) and there is no easy way to convey the allocation context between the layers. Introduce memalloc_nofs_{save,restore} API to control the scope of GFP_NOFS allocation context. This is basically copying memalloc_noio_{save,restore} API we have for other restricted allocation context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is just an alias for PF_FSTRANS which has been xfs specific until recently. There are no more PF_FSTRANS users anymore so let's just drop it. PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags is renamed to current_gfp_context because it now cares about both PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve their semantic. kmem_flags_convert() doesn't need to evaluate the flag anymore. This patch shouldn't introduce any functional changes. Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS) usage as much as possible and only use a properly documented memalloc_nofs_{save,restore} checkpoints where they are appropriate. Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.h | 2 +- include/linux/gfp.h | 8 ++++++++ include/linux/sched.h | 8 +++----- include/linux/sched/mm.h | 26 +++++++++++++++++++++++--- kernel/locking/lockdep.c | 6 +++--- mm/page_alloc.c | 10 ++++++---- mm/vmscan.c | 6 +++--- 7 files changed, 47 insertions(+), 19 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index a6c8da40c70d..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 978232a3b4ae..2bfcfd33e476 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -210,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. 
+ * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware diff --git a/include/linux/sched.h b/include/linux/sched.h index 4528f7c9789f..9c3ee2281a56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1211,9 +1211,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ @@ -1224,8 +1224,6 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ - /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..9daabe138c99 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. 
+ * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a06159c984e4..ab94a8606d3c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2864,7 +2864,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; - gfp_mask = memalloc_noio_flags(gfp_mask); + gfp_mask = current_gfp_context(gfp_mask); /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -2875,7 +2875,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } void lockdep_clear_current_reclaim_state(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b4b7d5017a4..b4a9f9382e0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3949,10 +3949,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constrains. This is mainly about + * GFP_NOFS resp. 
GFP_NOIO which has to be inherited for all + * allocation requests from a particular context which has + * been marked by memalloc_no{fs,io}_{save,restore} */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -7405,7 +7407,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/vmscan.c b/mm/vmscan.c index ec4555369e17..3ad66580b8b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2915,7 +2915,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2995,7 +2995,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3702,7 +3702,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), -- cgit v1.2.3 From c7c9c685b40e38d82ed89cb099d3118646520457 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:35 +1000 Subject: mm-introduce-memalloc_nofs_saverestore-api-fix fix comment typo, reflow comment Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4a9f9382e0c..0b1de367e79a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3949,10 +3949,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Apply scoped allocation constrains. This is mainly about - * GFP_NOFS resp. GFP_NOIO which has to be inherited for all - * allocation requests from a particular context which has - * been marked by memalloc_no{fs,io}_{save,restore} + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; -- cgit v1.2.3 From 6f840f9cc900929a7dd86906a31cde397a16aad8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:36 +1000 Subject: xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio* kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} API to prevent from reclaim recursion into the fs because vmalloc can invoke unconditional GFP_KERNEL allocations and these functions might be called from the NOFS contexts. 
The memalloc_noio_save will enforce GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should provide exactly what we need here - implicit GFP_NOFS context. Link: http://lkml.kernel.org/r/20170306131408.9828-6-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Acked-by: Vlastimil Babka Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Jan Kara Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- fs/xfs/kmem.c | 12 ++++++------ fs/xfs/xfs_buf.c | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index d0ac1a065539..780fc8986dab 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void *ptr; gfp_t lflags; @@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39..ca09061369cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; -- cgit v1.2.3 From fbb8c9f146ea722f52203b2a87c24a9514bd92dd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:36 +1000 Subject: jbd2: mark the transaction context with the scope GFP_NOFS context now that we have memalloc_nofs_{save,restore} api we can mark the whole transaction context as implicitly GFP_NOFS. All allocations will automatically inherit GFP_NOFS this way. 
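For illustration only, a minimal sketch of the scope pattern this relies on (the helper below is hypothetical, not part of the patch): every allocation issued between the save and the restore implicitly drops __GFP_FS, so a plain GFP_KERNEL request behaves as GFP_NOFS inside the scope.

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Hypothetical helper: allocate while a transaction-like NOFS scope is open. */
static void *alloc_within_nofs_scope(size_t size)
{
	unsigned int nofs_flags;
	void *p;

	nofs_flags = memalloc_nofs_save();	/* sets PF_MEMALLOC_NOFS */
	p = kmalloc(size, GFP_KERNEL);		/* treated as GFP_NOFS here */
	memalloc_nofs_restore(nofs_flags);	/* closes the NOFS scope */

	return p;
}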
This means that we do not have to mark any of those requests with GFP_NOFS and moreover all the ext4_kv[mz]alloc(GFP_NOFS) are also safe now because even the hardcoded GFP_KERNEL allocations deep inside the vmalloc will be NOFS now. Link: http://lkml.kernel.org/r/20170306131408.9828-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/jbd2/transaction.c | 12 ++++++++++++ include/linux/jbd2.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..d8f09f34285f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Make sure that no allocations done while the transaction is + * open is going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * scope of th GFP_NOFS context is over here and so we can + * restore the original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; -- cgit v1.2.3 From 12016b84e88322bbecdc0a0b4d23272f5c91081d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:37 +1000 Subject: jbd2-mark-the-transaction-context-with-the-scope-gfp_nofs-context-fix tweak comments Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton --- fs/jbd2/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d8f09f34285f..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -390,8 +390,8 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); /* - * Make sure that no allocations done while the transaction is - * open is going to recurse back to the fs layer. + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. */ handle->saved_alloc_context = memalloc_nofs_save(); return 0; @@ -1768,8 +1768,8 @@ int jbd2_journal_stop(handle_t *handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: /* - * scope of th GFP_NOFS context is over here and so we can - * restore the original alloc context. + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. 
*/ memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); -- cgit v1.2.3 From 670cd509d441dddccc89ecb33163bb8f3d374f3f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:37 +1000 Subject: jbd2: make the whole kjournald2 kthread NOFS safe kjournald2 is central to the transaction commit processing. As such any potential allocation from this kernel thread has to be GFP_NOFS. Make sure to mark the whole kernel thread GFP_NOFS by the memalloc_nofs_save. Link: http://lkml.kernel.org/r/20170306131408.9828-8-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5adc2fb62b0f..5e86e8ca1324 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -205,6 +206,13 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); + /* + * Make sure that no allocations from this kernel thread will ever recurse + * to the fs layer because we are responsible for the transaction commit + * and any fs involvement might get stuck waiting for the trasn. commit. + */ + memalloc_nofs_save(); + /* * And now, wait forever for commit wakeup events. */ -- cgit v1.2.3 From b213931158477c10f5e584ba3f0fdac5a628a7b4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:38 +1000 Subject: jbd2-make-the-whole-kjournald2-kthread-nofs-safe-checkpatch-fixes WARNING: line over 80 characters #42: FILE: fs/jbd2/journal.c:210: + * Make sure that no allocations from this kernel thread will ever recurse total: 0 errors, 1 warnings, 20 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/jbd2-make-the-whole-kjournald2-kthread-nofs-safe.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e86e8ca1324..c43fe83ee708 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -207,9 +207,10 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* - * Make sure that no allocations from this kernel thread will ever recurse - * to the fs layer because we are responsible for the transaction commit - * and any fs involvement might get stuck waiting for the trasn. commit. + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. 
*/ memalloc_nofs_save(); -- cgit v1.2.3 From bee959dd9d80673220934e52213dfd98d3c28a02 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 5 Apr 2017 09:20:39 +1000 Subject: mm: tighten up the fault path a little The round_up() macro generates a couple of unnecessary instructions in this usage: 48cd: 49 8b 47 50 mov 0x50(%r15),%rax 48d1: 48 83 e8 01 sub $0x1,%rax 48d5: 48 0d ff 0f 00 00 or $0xfff,%rax 48db: 48 83 c0 01 add $0x1,%rax 48df: 48 c1 f8 0c sar $0xc,%rax 48e3: 48 39 c3 cmp %rax,%rbx 48e6: 72 2e jb 4916 If we change round_up() to ((x) + __round_mask(x, y)) & ~__round_mask(x, y) then GCC can see through it and remove the mask (because that would be dead code given the subsequent shift): 48cd: 49 8b 47 50 mov 0x50(%r15),%rax 48d1: 48 05 ff 0f 00 00 add $0xfff,%rax 48d7: 48 c1 e8 0c shr $0xc,%rax 48db: 48 39 c3 cmp %rax,%rbx 48de: 72 2e jb 490e But that's problematic because we'd evaluate 'y' twice. Converting round_up into an inline function prevents it from being used in other definitions. The easiest thing to do is just change these three usages of round_up to use DIV_ROUND_UP. Also add an unlikely() because GCC's heuristic is wrong in this case. Link: http://lkml.kernel.org/r/20170207192812.5281-1-willy@infradead.org Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..68b166a9eda0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2202,12 +2202,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2256,8 +2256,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2323,7 +2323,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2369,8 +2369,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) -- cgit v1.2.3 From eaf02dbe42bb68dc2b7114a879e495b569562e86 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:20:39 +1000 Subject: mm: remove rodata_test_data export, add pr_fmt Since commit 3ad38ceb2769 ("x86/mm: Remove CONFIG_DEBUG_NX_TEST"), nothing is using the exported rodata_test_data variable, so drop the export. Additionally updates the pr_fmt to avoid redundant strings and adjusts some whitespace. 
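As background for the pr_fmt part of this change, a small sketch of the mechanism (the function below is made up for illustration): the pr_* helpers expand to printk(KERN_<LEVEL> pr_fmt(fmt), ...), so defining pr_fmt once at the top of the file prefixes every message and the per-call "rodata_test: " strings become redundant.

#define pr_fmt(fmt) "rodata_test: " fmt	/* must precede the includes */

#include <linux/printk.h>

static void report_failure(void)
{
	/* emits "rodata_test: test data was changed" at KERN_ERR level */
	pr_err("test data was changed\n");
}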
Link: http://lkml.kernel.org/r/20170307005313.GA85809@beast Signed-off-by: Kees Cook Cc: Jinbum Park Cc: Arjan van de Ven Signed-off-by: Andrew Morton --- include/linux/rodata_test.h | 1 - mm/rodata_test.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c51413..84766bcdd01f 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#define pr_fmt(fmt) "rodata_test: " fmt + #include #include const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } -- cgit v1.2.3 From 4867f817534bee769144e9576eef2ca6cc0eeac9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:40 +1000 Subject: mm: move pcp and lru-pcp draining into single wq We currently have 2 specific WQ_RECLAIM workqueues in the mm code. vmstat_wq for updating pcp stats and lru_add_drain_wq dedicated to drain per cpu lru caches. This seems more than necessary because both can run on a single WQ. Both do not block on locks requiring a memory allocation nor perform any allocations themselves. We will save one rescuer thread this way. On the other hand drain_all_pages() queues work on the system wq which doesn't have rescuer and so this depend on memory allocation (when all workers are stuck allocating and new ones cannot be created). This is not critical as there should be somebody invoking the OOM killer (e.g. the forking worker) and get the situation unstuck and eventually performs the draining. Quite annoying though. This worker should be using WQ_RECLAIM as well. We can reuse the same one as for lru draining and vmstat. 
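To illustrate the pattern being consolidated here (the names below are illustrative, not the actual mm code): a WQ_MEM_RECLAIM workqueue keeps a dedicated rescuer thread, so work queued on it can still make progress when memory pressure prevents forking new workers, and per-CPU drain work is queued on it with queue_work_on().

#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/init.h>

static struct workqueue_struct *reclaim_wq;	/* illustrative name */
static DEFINE_PER_CPU(struct work_struct, drain_work);

static void drain_fn(struct work_struct *work)
{
	/* drain this CPU's caches; must not itself allocate memory */
}

static int __init drain_wq_init(void)
{
	int cpu;

	/* WQ_MEM_RECLAIM guarantees a rescuer thread for forward progress */
	reclaim_wq = alloc_workqueue("reclaim_wq",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!reclaim_wq)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(&drain_work, cpu);

		INIT_WORK(work, drain_fn);
		queue_work_on(cpu, reclaim_wq, work);	/* runs on that CPU */
	}
	return 0;
}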
Link: http://lkml.kernel.org/r/20170307131751.24936-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Tetsuo Handa Acked-by: Vlastimil Babka Acked-by: Mel Gorman Tested-by: Yang Li Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++++ mm/page_alloc.c | 9 ++++++++- mm/swap.c | 27 ++++++++------------------- mm/vmstat.c | 14 +++++++++----- 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 823a7a89099b..04d08ef91224 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -486,6 +486,13 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, enum ttu_flags; struct tlbflush_unmap_batch; + +/* + * only for MM internal work items which do not depend on + * any allocations or locks which might depend on allocations + */ +extern struct workqueue_struct *mm_percpu_wq; + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b1de367e79a..c0977c4d0501 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2364,6 +2364,13 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON_ONCE(!mm_percpu_wq)) + return; + /* Workqueues cannot recurse */ if (current->flags & PF_WQ_WORKER) return; @@ -2413,7 +2420,7 @@ void drain_all_pages(struct zone *zone) for_each_cpu(cpu, &cpus_with_pcps) { struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); INIT_WORK(work, drain_local_pages_wq); - schedule_work_on(cpu, work); + queue_work_on(cpu, mm_percpu_wq, work); } for_each_cpu(cpu, &cpus_with_pcps) flush_work(per_cpu_ptr(&pcpu_drain, cpu)); diff --git a/mm/swap.c b/mm/swap.c index ac98eb443a03..361bdb1575ab 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -677,30 +677,19 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -/* - * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM - * workqueue, aiding in getting memory freed. - */ -static struct workqueue_struct *lru_add_drain_wq; - -static int __init lru_init(void) -{ - lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); - - if (WARN(!lru_add_drain_wq, - "Failed to create workqueue lru_add_drain_wq")) - return -ENOMEM; - - return 0; -} -early_initcall(lru_init); - void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON(!mm_percpu_wq)) + return; + mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); @@ -714,7 +703,7 @@ void lru_add_drain_all(void) pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - queue_work_on(cpu, lru_add_drain_wq, work); + queue_work_on(cpu, mm_percpu_wq, work); cpumask_set_cpu(cpu, &has_work); } } diff --git a/mm/vmstat.c b/mm/vmstat.c index fe937e32a7a6..6561402443cb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1563,7 +1563,6 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP -static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; @@ -1621,7 +1620,7 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. 
Keep on running the * update worker thread. */ - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } @@ -1700,7 +1699,7 @@ static void vmstat_shepherd(struct work_struct *w) struct delayed_work *dw = &per_cpu(vmstat_work, cpu); if (!delayed_work_pending(dw) && need_update(cpu)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); } put_online_cpus(); @@ -1716,7 +1715,6 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -1762,10 +1760,16 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif +struct workqueue_struct *mm_percpu_wq; + void __init init_mm_internals(void) { + int ret __maybe_unused; + + mm_percpu_wq = alloc_workqueue("vmstat", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + #ifdef CONFIG_SMP - int ret; ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", NULL, vmstat_cpu_dead); -- cgit v1.2.3 From 0ec46c948f7fc081671291037eb92c6e6a5fc46e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:20:40 +1000 Subject: mm-move-pcp-and-lru-pcp-drainging-into-single-wq-fix > Should the workqueue also have been renamed to mm_percpu_wq? Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/vmstat.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6561402443cb..757be8303aa0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1766,11 +1766,10 @@ void __init init_mm_internals(void) { int ret __maybe_unused; - mm_percpu_wq = alloc_workqueue("vmstat", + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); #ifdef CONFIG_SMP - ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", NULL, vmstat_cpu_dead); if (ret < 0) -- cgit v1.2.3 From 34883c08506e0b33b283fe252c59edbc17efd32a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:41 +1000 Subject: mm, compaction: reorder fields in struct compact_control Patch series "try to reduce fragmenting fallbacks", v3. Last year, Johannes Weiner has reported a regression in page mobility grouping [1] and while the exact cause was not found, I've come up with some ways to improve it by reducing the number of allocations falling back to different migratetype and causing permanent fragmentation. The series was tested with mmtests stress-highalloc modified to do GFP_KERNEL order-4 allocations, on 4.9 with "mm, vmscan: fix zone balance check in prepare_kswapd_sleep" (without that, kcompactd indeed wasn't woken up) on UMA machine with 4GB memory. There were 5 repeats of each run, as the extfrag stats are quite volatile (note the stats below are sums, not averages, as it was less perl hacking for me). Success rate are the same, already high due to the low allocation order used, so I'm not including them. 
Compaction stats: (the patches are stacked, and I haven't measured the non-functional-changes patches separately) patch 1 patch 2 patch 3 patch 4 patch 7 patch 8 Compaction stalls 22449 24680 24846 19765 22059 17480 Compaction success 12971 14836 14608 10475 11632 8757 Compaction failures 9477 9843 10238 9290 10426 8722 Page migrate success 3109022 3370438 3312164 1695105 1608435 2111379 Page migrate failure 911588 1149065 1028264 1112675 1077251 1026367 Compaction pages isolated 7242983 8015530 7782467 4629063 4402787 5377665 Compaction migrate scanned 980838938 987367943 957690188 917647238 947155598 1018922197 Compaction free scanned 557926893 598946443 602236894 594024490 541169699 763651731 Compaction cost 10243 10578 10304 8286 8398 9440 Compaction stats are mostly within noise until patch 4, which decreases the number of compactions, and migrations. Part of that could be due to more pageblocks marked as unmovable, and async compaction skipping those. This changes a bit with patch 7, but not so much. Patch 8 increases free scanner stats and migrations, which comes from the changed termination criteria. Interestingly number of compactions decreases - probably the fully compacted pageblock satisfies multiple subsequent allocations, so it amortizes. Next comes the extfrag tracepoint, where "fragmenting" means that an allocation had to fallback to a pageblock of another migratetype which wasn't fully free (which is almost all of the fallbacks). I have locally added another tracepoint for "Page steal" into steal_suitable_fallback() which triggers in situations where we are allowed to do move_freepages_block(). If we decide to also do set_pageblock_migratetype(), it's "Pages steal with pageblock" with break down for which allocation migratetype we are stealing and from which fallback migratetype. The last part "due to counting" comes from patch 4 and counts the events where the counting of movable pages allowed us to change pageblock's migratetype, while the number of free pages alone wouldn't be enough to cross the threshold. patch 1 patch 2 patch 3 patch 4 patch 7 patch 8 Page alloc extfrag event 10155066 8522968 10164959 15622080 13727068 13140319 Extfrag fragmenting 10149231 8517025 10159040 15616925 13721391 13134792 Extfrag fragmenting for unmovable 159504 168500 184177 97835 70625 56948 Extfrag fragmenting unmovable placed with movable 153613 163549 172693 91740 64099 50917 Extfrag fragmenting unmovable placed with reclaim. 5891 4951 11484 6095 6526 6031 Extfrag fragmenting for reclaimable 4738 4829 6345 4822 5640 5378 Extfrag fragmenting reclaimable placed with movable 1836 1902 1851 1579 1739 1760 Extfrag fragmenting reclaimable placed with unmov. 2902 2927 4494 3243 3901 3618 Extfrag fragmenting for movable 9984989 8343696 9968518 15514268 13645126 13072466 Pages steal 179954 192291 210880 123254 94545 81486 Pages steal with pageblock 22153 18943 20154 33562 29969 33444 Pages steal with pageblock for unmovable 14350 12858 13256 20660 19003 20852 Pages steal with pageblock for unmovable from mov. 12812 11402 11683 19072 17467 19298 Pages steal with pageblock for unmovable from recl. 1538 1456 1573 1588 1536 1554 Pages steal with pageblock for movable 7114 5489 5965 11787 10012 11493 Pages steal with pageblock for movable from unmov. 6885 5291 5541 11179 9525 10885 Pages steal with pageblock for movable from recl. 229 198 424 608 487 608 Pages steal with pageblock for reclaimable 689 596 933 1115 954 1099 Pages steal with pageblock for reclaimable from unmov. 
273 219 537 658 547 667 Pages steal with pageblock for reclaimable from mov. 416 377 396 457 407 432 Pages steal with pageblock due to counting 11834 10075 7530 ... for unmovable 8993 7381 4616 ... for movable 2792 2653 2851 ... for reclaimable 49 41 63 What we can see is that "Extfrag fragmenting for unmovable" and "... placed with movable" drops with almost each patch, which is good as we are polluting less movable pageblocks with unmovable pages. The most significant change is patch 4 with movable page counting. On the other hand it increases "Extfrag fragmenting for movable" by 50%. "Pages steal" drops though, so these movable allocation fallbacks find only small free pages and are not allowed to steal whole pageblocks back. "Pages steal with pageblock" raises, because the patch increases the chances of pageblock migratetype changes to happen. This affects all migratetypes. The summary is that patch 4 is not a clear win wrt these stats, but I believe that the tradeoff it makes is a good one. There's less pollution of movable pageblocks by unmovable allocations. There's less stealing between pageblock, and those that remain have higher chance of changing migratetype also the pageblock itself, so it should more faithfully reflect the migratetype of the pages within the pageblock. The increase of movable allocations falling back to unmovable pageblock might look dramatic, but those allocations can be migrated by compaction when needed, and other patches in the series (7-9) improve that aspect. Patches 7 and 8 continue the trend of reduced unmovable fallbacks and also reduce the impact on movable fallbacks from patch 4. [1] https://www.spinics.net/lists/linux-mm/msg114237.html This patch (of 8): While currently there are (mostly by accident) no holes in struct compact_control (on x86_64), but we are going to add more bool flags, so place them all together to the end of the structure. While at it, just order all fields from largest to smallest. Link: http://lkml.kernel.org/r/20170307131545.28577-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/internal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 04d08ef91224..004471b72977 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -183,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -190,16 +191,15 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... 
*/ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ }; -- cgit v1.2.3 From 66403bc7d65352979de165613fa1dc655dca861e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:41 +1000 Subject: mm, compaction: remove redundant watermark check in compact_finished() When detecting whether compaction has succeeded in forming a high-order page, __compact_finished() employs a watermark check, followed by an own search for a suitable page in the freelists. This is not ideal for two reasons: - The watermark check also searches high-order freelists, but has a less strict criteria wrt fallback. It's therefore redundant and waste of cycles. This was different in the past when high-order watermark check attempted to apply reserves to high-order pages. - The watermark check might actually fail due to lack of order-0 pages. Compaction can't help with that, so there's no point in continuing because of that. It's possible that high-order page still exists and it terminates. This patch therefore removes the watermark check. This should save some cycles and terminate compaction sooner in some cases. Link: http://lkml.kernel.org/r/20170307131545.28577-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf..9222ff362f33 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1280,7 +1280,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ const int migratetype) { unsigned int order; - unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,13 +1307,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; - /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; -- cgit v1.2.3 From 25b17331ed1a2dcab6b12688bc6d63bcf00c5728 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:42 +1000 Subject: mm, page_alloc: split smallest stolen page in fallback The __rmqueue_fallback() function is called when there's no free page of requested migratetype, and we need to steal from a different one. There are various heuristics to make this event infrequent and reduce permanent fragmentation. The main one is to try stealing from a pageblock that has the most free pages, and possibly steal them all at once and convert the whole pageblock. Precise searching for such pageblock would be expensive, so instead the heuristics walks the free lists from MAX_ORDER down to requested order and assumes that the block with highest-order free page is likely to also have the most free pages in total. 
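In rough outline, the walk described above looks something like the sketch below; this is a simplified stand-in, not the exact mm/page_alloc.c code, and the loop body is abbreviated.

	int current_order;

	/*
	 * Scan from the largest order down and take the first fallback page
	 * found, assuming its pageblock is the emptiest candidate.
	 */
	for (current_order = MAX_ORDER - 1; current_order >= order;
							current_order--) {
		area = &zone->free_area[current_order];
		fallback_mt = find_suitable_fallback(area, current_order,
					start_migratetype, false, &can_steal);
		if (fallback_mt < 0)
			continue;

		page = list_first_entry(&area->free_list[fallback_mt],
					struct page, lru);
		/* possibly steal the whole pageblock, then split this page */
		break;
	}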
Chances are that together with the highest-order page, we steal also pages of lower orders from the same block. But then we still split the highest order page. This is wasteful and can contribute to fragmentation instead of avoiding it. This patch thus changes __rmqueue_fallback() to just steal the page(s) and put them on the freelist of the requested migratetype, and only report whether it was successful. Then we pick (and eventually split) the smallest page with __rmqueue_smallest(). This all happens under zone lock, so nobody can steal it from us in the process. This should reduce fragmentation due to fallbacks. At worst we are only stealing a single highest-order page and waste some cycles by moving it between lists and then removing it, but fallback is not exactly hot path so that should not be a concern. As a side benefit the patch removes some duplicate code by reusing __rmqueue_smallest(). Link: http://lkml.kernel.org/r/20170307131545.28577-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c0977c4d0501..76236b572fc6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1953,23 +1953,41 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * use it's pages as requested migratetype in the future. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); + struct free_area *area; int pages; + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic_page(page)) + goto single_page; + /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; + pages = move_freepages_block(zone, page, start_type); /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2128,8 +2146,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. 
+ */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2150,32 +2173,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && !is_migrate_highatomic_page(page)) - steal_suitable_fallback(zone, page, start_migratetype); - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2187,13 +2195,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); -- cgit v1.2.3 From 6c0d9a04e3a0f6dcfee1ecefae8530e0adffa445 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:43 +1000 Subject: mm-page_alloc-split-smallest-stolen-page-in-fallback-fix The lkp-robot reported a test case stuck on boot due to the patch [1] which was due to endless loop in the modified __rmqueue(). It blindly expected that move_freepages_block() will succeed, but that can fail due to last pageblock pfn not belonging to the same zone as the fallback candidate page. This fix checks the result of move_freepages_block() and steals the single candidate page if it fails, which was also effectively done before [1]. 
[1] mmotm: mm-page_alloc-split-smallest-stolen-page-in-fallback.patch Link: http://lkml.kernel.org/r/59d71b35-d556-4fc9-ee2e-1574259282fd@suse.cz Signed-off-by: Vlastimil Babka Cc: Stephen Rothwell Cc: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76236b572fc6..299ecbae04d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1977,6 +1977,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; pages = move_freepages_block(zone, page, start_type); + /* moving whole block can fail due to zone boundary conditions */ + if (!pages) + goto single_page; /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || -- cgit v1.2.3 From 5ec0505c1417ae25edc0ad7106fa5122ceff2d53 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:43 +1000 Subject: mm, page_alloc: count movable pages when stealing from pageblock When stealing pages from pageblock of a different migratetype, we count how many free pages were stolen, and change the pageblock's migratetype if more than half of the pageblock was free. This might be too conservative, as there might be other pages that are not free, but were allocated with the same migratetype as our allocation requested. While we cannot determine the migratetype of allocated pages precisely (at least without the page_owner functionality enabled), we can count pages that compaction would try to isolate for migration - those are either on LRU or __PageMovable(). The rest can be assumed to be MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE, which we cannot easily distinguish. This counting can be done as part of free page stealing with little additional overhead. The page stealing code is changed so that it considers free pages plus pages of the "good" migratetype for the decision whether to change pageblock's migratetype. The result should be more accurate migratetype of pageblocks wrt the actual pages in the pageblocks, when stealing from semi-occupied pageblocks. This should help the efficiency of page grouping by mobility. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 47%. The number of movable allocations falling back to other pageblocks are increased by 55%, but these events don't cause permanent fragmentation, so the tradeoff should be positive. Later patches also offset the movable fallback increase to some extent. 
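As an illustration of the new claim rule (a standalone user-space sketch, not the kernel code; it assumes pageblock_order == 9, i.e. 512 pages per 2MB pageblock, and the page counts are made-up example values):

  #include <stdbool.h>
  #include <stdio.h>

  /* Example value: 4K pages, 2MB pageblocks -> 512 pages per pageblock. */
  #define EXAMPLE_PAGEBLOCK_NR_PAGES (1 << 9)

  /*
   * Sketch of the claim decision: change the pageblock's migratetype when
   * the free pages just moved plus the already-allocated pages of a
   * compatible ("alike") migratetype cover at least half of the pageblock.
   */
  static bool should_claim_block(int free_pages, int alike_pages)
  {
          return free_pages + alike_pages >= EXAMPLE_PAGEBLOCK_NR_PAGES / 2;
  }

  int main(void)
  {
          /* 200 free pages were moved, 80 allocated pages look compatible */
          printf("claim: %d\n", should_claim_block(200, 80)); /* 280 >= 256 -> 1 */
          /* counting only the free pages, as before this patch, would not claim */
          printf("claim: %d\n", should_claim_block(200, 0));  /* 200 <  256 -> 0 */
          return 0;
  }

In other words, a pageblock that is already half-populated by compatible allocations can now be claimed even when fewer than half of its pages are free.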
Link: http://lkml.kernel.org/r/20170307131545.28577-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 5 +-- mm/page_alloc.c | 72 ++++++++++++++++++++++++++++++++++-------- mm/page_isolation.c | 5 +-- 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 299ecbae04d6..a8f518bd5651 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1837,9 +1837,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1856,6 +1856,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1866,6 +1869,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1881,7 +1893,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1898,7 +1910,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1948,22 +1961,26 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. 
If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; - int pages; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. */ - if (is_migrate_highatomic_page(page)) + if (is_migrate_highatomic(old_block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ @@ -1976,13 +1993,39 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - pages = move_freepages_block(zone, page, start_type); + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + /* moving whole block can fail due to zone boundary conditions */ if (!pages) goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); @@ -2060,7 +2103,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2137,7 +2180,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7927bbb54a4e..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. 
*/ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); -- cgit v1.2.3 From eadd0fe934746c6343cb605ee885141219ab7a65 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:20:44 +1000 Subject: mm-page_alloc-count-movable-pages-when-stealing-from-pageblock-fix merge fix Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a8f518bd5651..a834a578c256 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2018,7 +2018,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, } /* moving whole block can fail due to zone boundary conditions */ - if (!pages) + if (!free_pages) goto single_page; /* -- cgit v1.2.3 From c70542057ec37fc03c057cb73a906491a2595329 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:44 +1000 Subject: mm, compaction: change migrate_async_suitable() to suitable_migration_source() Preparation for making the decisions more complex and depending on compact_control flags. No functional change. Link: http://lkml.kernel.org/r/20170307131545.28577-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ mm/compaction.c | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 446cf68c1c09..618499159a7c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) diff --git a/mm/compaction.c b/mm/compaction.c index 9222ff362f33..a7c2f0da7228 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,6 +983,15 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION +static bool suitable_migration_source(struct compact_control *cc, + struct page *page) +{ + if (cc->mode != MIGRATE_ASYNC) + return true; + + return is_migrate_movable(get_pageblock_migratetype(page)); +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct compact_control *cc, struct page *page) @@ -1007,7 +1011,7 @@ static bool suitable_migration_target(struct compact_control *cc, } /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1246,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * 
Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ -- cgit v1.2.3 From 5a4d3740b1161d84a5c4a15b208b75ad188ae59e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:45 +1000 Subject: mm, compaction: add migratetype to compact_control Preparation patch. We are going to need migratetype at lower layers than compact_zone() and compact_finished(). Link: http://lkml.kernel.org/r/20170307131545.28577-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 15 +++++++-------- mm/internal.h | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a7c2f0da7228..c48da73e30a5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1279,10 +1279,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1338,12 +1339,11 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1476,9 +1476,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1528,8 +1528,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/internal.h b/mm/internal.h index 004471b72977..e7e709fd3043 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -193,6 +193,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ -- cgit v1.2.3 From bc84ff5428991f9c1b4d941927fa436503719a9e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:45 +1000 Subject: mm, 
compaction: restrict async compaction to pageblocks of same migratetype The migrate scanner in async compaction is currently limited to MIGRATE_MOVABLE pageblocks. This is a heuristic intended to reduce latency, based on the assumption that non-MOVABLE pageblocks are unlikely to contain movable pages. However, with the exception of THP's, most high-order allocations are not movable. Should the async compaction succeed, this increases the chance that the non-MOVABLE allocations will fallback to a MOVABLE pageblock, making the long-term fragmentation worse. This patch attempts to help the situation by changing async direct compaction so that the migrate scanner only scans the pageblocks of the requested migratetype. If it's a non-MOVABLE type and there are such pageblocks that do contain movable pages, chances are that the allocation can succeed within one of such pageblocks, removing the need for a fallback. If that fails, the subsequent sync attempt will ignore this restriction. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 30%. The number of movable allocations falling back is reduced by 12%. Link: http://lkml.kernel.org/r/20170307131545.28577-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +++++++++-- mm/page_alloc.c | 20 +++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c48da73e30a5..2c288e75840d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -986,10 +986,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->mode != MIGRATE_ASYNC) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; - return is_migrate_movable(get_pageblock_migratetype(page)); + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; } /* Returns true if the page is within a block suitable for migration to */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a834a578c256..7d0332b5f3d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3677,6 +3677,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3744,12 +3745,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
*/ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3761,7 +3767,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3842,7 +3848,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, -- cgit v1.2.3 From 32795e89bce9e578747e284a9883b382542703f3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Apr 2017 09:20:46 +1000 Subject: mm, compaction: finish whole pageblock to reduce fragmentation The main goal of direct compaction is to form a high-order page for allocation, but it should also help against long-term fragmentation when possible. Most lower-than-pageblock-order compactions are for non-movable allocations, which means that if we compact in a movable pageblock and terminate as soon as we create the high-order page, it's unlikely that the fallback heuristics will claim the whole block. Instead there might be a single unmovable page in a pageblock full of movable pages, and the next unmovable allocation might pick another pageblock and increase long-term fragmentation. To help against such scenarios, this patch changes the termination criteria for compaction so that the current pageblock is finished even though the high-order page already exists. Note that it might be possible that the high-order page formed elsewhere in the zone due to parallel activity, but this patch doesn't try to detect that. This is only done with sync compaction, because async compaction is limited to pageblock of the same migratetype, where it cannot result in a migratetype fallback. (Async compaction also eagerly skips order-aligned blocks where isolation fails, which is against the goal of migrating away as much of the pageblock as possible.) As a result of this patch, long-term memory fragmentation should be reduced. In testing based on 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 20%. The number Link: http://lkml.kernel.org/r/20170307131545.28577-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton --- mm/compaction.c | 36 ++++++++++++++++++++++++++++++++++-- mm/internal.h | 1 + 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 2c288e75840d..bc7903130501 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,6 +1318,17 @@ static enum compact_result __compact_finished(struct zone *zone, if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. 
+ */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } + /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; @@ -1338,8 +1349,29 @@ static enum compact_result __compact_finished(struct zone *zone, * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. + */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; diff --git a/mm/internal.h b/mm/internal.h index e7e709fd3043..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -202,6 +202,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long -- cgit v1.2.3 From 7fc5212771801fb2f185ca61d0e3d6083e8b19c1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:46 +1000 Subject: mm: do not use double negation for testing page flags With the discussion[1], I found it seems there are every PageFlags functions return bool at this moment so we don't need double negation any more. Although it's not a problem to keep it, it makes future users confused to use double negation for them, too. Remove such possibility. [1] https://marc.info/?l=linux-kernel&m=148881578820434 Frankly sepaking, I like every PageFlags to return bool instead of int. It will make it clear. AFAIR, Chen Gang had tried it but don't know why it was not merged at that time. http://lkml.kernel.org/r/1469336184-1904-1-git-send-email-chengang@emindsoft.com.cn Link: http://lkml.kernel.org/r/1488868597-32222-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Chen Gang Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88e4b1737c90..7cb9c88bb4a3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -548,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -1181,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. 
*/ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } -- cgit v1.2.3 From ca737d720d0b5abeb7ccb4e42e4ea12a8b7574ed Mon Sep 17 00:00:00 2001 From: Shantanu Goel Date: Wed, 5 Apr 2017 09:20:47 +1000 Subject: mm, vmscan: fix zone balance check in prepare_kswapd_sleep

Patch series "Reduce amount of time kswapd sleeps prematurely", v2.

The series is unusual in that the first patch fixes one problem and introduces other issues that are noted in the changelog. Patch 2 makes a minor modification that is worth considering on its own but leaves the kernel in a state where it behaves badly. It's not until patch 3 that there is an improvement against baseline.

This was mostly motivated by examining Chris Mason's "simoop" benchmark which puts the VM under similar pressure to HADOOP. It has been reported that the benchmark has regressed severely during the last number of releases. While I cannot reproduce all the same problems Chris experienced due to hardware limitations, there were a number of problems on a 2-socket machine with a single disk.

simoop latencies
                                       4.11.0-rc1            4.11.0-rc1
                                          vanilla          keepawake-v2
Amean    p50-Read            21670074.18 (  0.00%) 22668332.52 ( -4.61%)
Amean    p95-Read            25456267.64 (  0.00%) 26738688.00 ( -5.04%)
Amean    p99-Read            29369064.73 (  0.00%) 30991404.52 ( -5.52%)
Amean    p50-Write               1390.30 (  0.00%)      924.91 ( 33.47%)
Amean    p95-Write             412901.57 (  0.00%)     1362.62 ( 99.67%)
Amean    p99-Write            6668722.09 (  0.00%)    16854.04 ( 99.75%)
Amean    p50-Allocation         78714.31 (  0.00%)    74729.74 (  5.06%)
Amean    p95-Allocation        175533.51 (  0.00%)   101609.74 ( 42.11%)
Amean    p99-Allocation        247003.02 (  0.00%)   125765.57 ( 49.08%)

These are latencies. Read/write are threads reading fixed-size random blocks from a simulated database. The allocation latency is mmapping and faulting regions of memory. The p50, p95 and p99 rows report the worst latencies for 50%, 95% and 99% of the samples respectively. For example, the report indicates that while the test was running, 99% of writes completed 99.75% faster. It's worth noting that on a UMA machine no difference in performance with simoop was observed, so mileage will vary. It's noted that there is a slight impact to read latencies, but it's mostly due to IO scheduler decisions and offset by the large reduction in other latencies.

This patch (of 3):

The check in prepare_kswapd_sleep needs to match the one in balance_pgdat since the latter will return as soon as any one of the zones in the classzone is above the watermark. This is especially important for higher order allocations since balance_pgdat will typically reset the order to zero relying on compaction to create the higher order pages. Without this patch, prepare_kswapd_sleep fails to wake up kcompactd since the zone balance check fails.

It was first reported against 4.9.7 that kswapd is failing to wake up kcompactd due to a mismatch in the zone balance check between balance_pgdat() and prepare_kswapd_sleep(). balance_pgdat() returns as soon as a single zone satisfies the allocation but prepare_kswapd_sleep() requires all zones to do the same. This causes prepare_kswapd_sleep() to never succeed except in the order == 0 case and consequently, wakeup_kcompactd() is never called.
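The mismatch is easiest to see side by side. Below is a simplified user-space sketch (the zone_balanced() stub, the zone count and the helper names are illustrative, not the real kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  #define NR_ZONES 4

  /* Stub standing in for the real watermark check. */
  static bool zone_balanced(const bool *balanced, int zone)
  {
          return balanced[zone];
  }

  /* Old check: sleep only if *every* eligible zone is balanced. */
  static bool can_sleep_old(const bool *balanced, int classzone_idx)
  {
          for (int i = 0; i <= classzone_idx; i++)
                  if (!zone_balanced(balanced, i))
                          return false;
          return true;
  }

  /* New check, matching balance_pgdat(): sleep if *any* eligible zone is balanced. */
  static bool can_sleep_new(const bool *balanced, int classzone_idx)
  {
          for (int i = 0; i <= classzone_idx; i++)
                  if (zone_balanced(balanced, i))
                          return true;
          return false;
  }

  int main(void)
  {
          /* only ZONE_NORMAL (index 2) was balanced by kswapd */
          bool balanced[NR_ZONES] = { false, false, true, false };

          printf("old: %d new: %d\n", can_sleep_old(balanced, 2),
                 can_sleep_new(balanced, 2)); /* old: 0 new: 1 */
          return 0;
  }

In the state above, balance_pgdat() already considers the node balanced, while the old prepare_kswapd_sleep() keeps refusing to sleep and therefore never reaches the point where kcompactd is woken.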
For the machine that originally motivated this patch, the state of compaction from /proc/vmstat looked this way after a day and a half of uptime:

compact_migrate_scanned 240496
compact_free_scanned 76238632
compact_isolated 123472
compact_stall 1791
compact_fail 29
compact_success 1762
compact_daemon_wake 0

After applying the patch and about 10 hours of uptime the state looks like this:

compact_migrate_scanned 59927299
compact_free_scanned 2021075136
compact_isolated 640926
compact_stall 4
compact_fail 2
compact_success 2
compact_daemon_wake 5160

Further notes from Mel that motivated him to pick this patch up and resend it:

It was observed for the simoop workload (pressures the VM similarly to HADOOP) that kswapd was failing to keep ahead of direct reclaim. The investigation noted that there was a need to rationalise kswapd decisions to reclaim with kswapd decisions to sleep. With this patch on a 2-socket box, there was a 49% reduction in direct reclaim scanning. However, the impact otherwise is extremely negative. Kswapd reclaim efficiency dropped from 98% to 76%. simoop has three latency-related metrics for read, write and allocation (an anonymous mmap and fault).

                                       4.11.0-rc1            4.11.0-rc1
                                          vanilla           fixcheck-v2
Amean    p50-Read            21670074.18 (  0.00%) 20464344.18 (  5.56%)
Amean    p95-Read            25456267.64 (  0.00%) 25721423.64 ( -1.04%)
Amean    p99-Read            29369064.73 (  0.00%) 30174230.76 ( -2.74%)
Amean    p50-Write               1390.30 (  0.00%)     1395.28 ( -0.36%)
Amean    p95-Write             412901.57 (  0.00%)    37737.74 ( 90.86%)
Amean    p99-Write            6668722.09 (  0.00%)   666489.04 ( 90.01%)
Amean    p50-Allocation         78714.31 (  0.00%)    86286.22 ( -9.62%)
Amean    p95-Allocation        175533.51 (  0.00%)   351812.27 (-100.42%)
Amean    p99-Allocation        247003.02 (  0.00%)  6291171.56 (-2447.00%)

Of greater concern is that the patch causes swapping, and page writes from kswapd context rose from 0 pages to 4189753 pages during the hour the workload ran for. By and large, the patch has very bad behaviour, but it is easily missed as the impact on a UMA machine is negligible. This patch is included with the data in case a bisection leads to this area. This patch is also a pre-requisite for the rest of the series.

Link: http://lkml.kernel.org/r/20170309075657.25121-2-mgorman@techsingularity.net Signed-off-by: Shantanu Goel Signed-off-by: Mel Gorman Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/vmscan.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ad66580b8b4..1860bfab02c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3103,11 +3103,11 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (!managed_zone(zone)) continue; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (zone_balanced(zone, order, classzone_idx)) + return true; } - return true; + return false; } /* @@ -3304,7 +3304,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed.
+ */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to -- cgit v1.2.3 From 68c78b1fdcb2ee72fd80741e920a42136431d18c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Apr 2017 09:20:48 +1000 Subject: mm, vmscan: only clear pgdat congested/dirty/writeback state when balanced A pgdat tracks if recent reclaim encountered too many dirty, writeback or congested pages. The flags control whether kswapd writes pages back from reclaim context, tags pages for immediate reclaim when IO completes, whether processes block on wait_iff_congested and whether kswapd blocks when too many pages marked for immediate reclaim are encountered. The state is cleared in a check function with side-effects. With the patch "mm, vmscan: fix zone balance check in prepare_kswapd_sleep", the timing of when the bits get cleared changed. Due to the way the check works, it'll clear the bits if ZONE_DMA is balanced for a GFP_DMA allocation because it does not account for lowmem reserves properly. For the simoop workload, kswapd is not stalling when it should due to the premature clearing, writing pages from reclaim context like crazy and generally being unhelpful. This patch resets the pgdat bits related to page reclaim only when kswapd is going to sleep. The comparison with simoop is then 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla fixcheck-v2 clear-v2 Amean p50-Read 21670074.18 ( 0.00%) 20464344.18 ( 5.56%) 19786774.76 ( 8.69%) Amean p95-Read 25456267.64 ( 0.00%) 25721423.64 ( -1.04%) 24101956.27 ( 5.32%) Amean p99-Read 29369064.73 ( 0.00%) 30174230.76 ( -2.74%) 27691872.71 ( 5.71%) Amean p50-Write 1390.30 ( 0.00%) 1395.28 ( -0.36%) 1011.91 ( 27.22%) Amean p95-Write 412901.57 ( 0.00%) 37737.74 ( 90.86%) 34874.98 ( 91.55%) Amean p99-Write 6668722.09 ( 0.00%) 666489.04 ( 90.01%) 575449.60 ( 91.37%) Amean p50-Allocation 78714.31 ( 0.00%) 86286.22 ( -9.62%) 84246.26 ( -7.03%) Amean p95-Allocation 175533.51 ( 0.00%) 351812.27 (-100.42%) 400058.43 (-127.91%) Amean p99-Allocation 247003.02 ( 0.00%) 6291171.56 (-2447.00%) 10905600.00 (-4315.17%) Read latency is improved, write latency is mostly improved but allocation latency is regressed. kswapd is still reclaiming inefficiently, pages are being written back from writeback context and a host of other issues. However, given the change, it needed to be spelled out why the side-effect was moved. Link: http://lkml.kernel.org/r/20170309075657.25121-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Shantanu Goel Signed-off-by: Andrew Morton --- mm/vmscan.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1860bfab02c5..8c553fa0d800 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3056,17 +3056,17 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) return false; - /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty - */ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); - return true; } +/* Clear pgdat state for congested, dirty or under writeback. 
*/ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); +} + /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. @@ -3103,8 +3103,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (!managed_zone(zone)) continue; - if (zone_balanced(zone, order, classzone_idx)) + if (zone_balanced(zone, order, classzone_idx)) { + clear_pgdat_congested(pgdat); return true; + } } return false; -- cgit v1.2.3 From 68487be26caaf5aba92272e37b538346d3a2cc18 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Apr 2017 09:20:48 +1000 Subject: mm, vmscan: prevent kswapd sleeping prematurely due to mismatched classzone_idx kswapd is woken to reclaim a node based on a failed allocation request from any eligible zone. Once reclaiming in balance_pgdat(), it will continue reclaiming until there is an eligible zone available for the zone it was woken for. kswapd tracks what zone it was recently woken for in pgdat->kswapd_classzone_idx. If it has not been woken recently, this zone will be 0. However, the decision on whether to sleep is made on kswapd_classzone_idx which is 0 without a recent wakeup request and that classzone does not account for lowmem reserves. This allows kswapd to sleep when a low small zone such as ZONE_DMA is balanced for a GFP_DMA request even if a stream of allocations cannot use that zone. While kswapd may be woken again shortly in the near future there are two consequences -- the pgdat bits that control congestion are cleared prematurely and direct reclaim is more likely as kswapd slept prematurely. This patch flips kswapd_classzone_idx to default to MAX_NR_ZONES (an invalid index) when there has been no recent wakeups. If there are no wakeups, it'll decide whether to sleep based on the highest possible zone available (MAX_NR_ZONES - 1). It then becomes critical that the "pgdat balanced" decisions during reclaim and when deciding to sleep are the same. If there is a mismatch, kswapd can stay awake continually trying to balance tiny zones. simoop was used to evaluate it again. Two of the preparation patches regressed the workload so they are included as the second set of results. Otherwise this patch looks artifically excellent 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla clear-v2 keepawake-v2 Amean p50-Read 21670074.18 ( 0.00%) 19786774.76 ( 8.69%) 22668332.52 ( -4.61%) Amean p95-Read 25456267.64 ( 0.00%) 24101956.27 ( 5.32%) 26738688.00 ( -5.04%) Amean p99-Read 29369064.73 ( 0.00%) 27691872.71 ( 5.71%) 30991404.52 ( -5.52%) Amean p50-Write 1390.30 ( 0.00%) 1011.91 ( 27.22%) 924.91 ( 33.47%) Amean p95-Write 412901.57 ( 0.00%) 34874.98 ( 91.55%) 1362.62 ( 99.67%) Amean p99-Write 6668722.09 ( 0.00%) 575449.60 ( 91.37%) 16854.04 ( 99.75%) Amean p50-Allocation 78714.31 ( 0.00%) 84246.26 ( -7.03%) 74729.74 ( 5.06%) Amean p95-Allocation 175533.51 ( 0.00%) 400058.43 (-127.91%) 101609.74 ( 42.11%) Amean p99-Allocation 247003.02 ( 0.00%) 10905600.00 (-4315.17%) 125765.57 ( 49.08%) With this patch on top, write and allocation latencies are massively improved. The read latencies are slightly impaired but it's worth noting that this is mostly due to the IO scheduler and not directly related to reclaim. 
The vmstats are a bit of a mix but the relevant ones are as follows; 4.10.0-rc7 4.10.0-rc7 4.10.0-rc7 mmots-20170209 clear-v1r25keepawake-v1r25 Swap Ins 0 0 0 Swap Outs 0 608 0 Direct pages scanned 6910672 3132699 6357298 Kswapd pages scanned 57036946 82488665 56986286 Kswapd pages reclaimed 55993488 63474329 55939113 Direct pages reclaimed 6905990 2964843 6352115 Kswapd efficiency 98% 76% 98% Kswapd velocity 12494.375 17597.507 12488.065 Direct efficiency 99% 94% 99% Direct velocity 1513.835 668.306 1393.148 Page writes by reclaim 0.000 4410243.000 0.000 Page writes file 0 4409635 0 Page writes anon 0 608 0 Page reclaim immediate 1036792 14175203 1042571 4.11.0-rc1 4.11.0-rc1 4.11.0-rc1 vanilla clear-v2 keepawake-v2 Swap Ins 0 12 0 Swap Outs 0 838 0 Direct pages scanned 6579706 3237270 6256811 Kswapd pages scanned 61853702 79961486 54837791 Kswapd pages reclaimed 60768764 60755788 53849586 Direct pages reclaimed 6579055 2987453 6256151 Kswapd efficiency 98% 75% 98% Page writes by reclaim 0.000 4389496.000 0.000 Page writes file 0 4388658 0 Page writes anon 0 838 0 Page reclaim immediate 1073573 14473009 982507 Swap-outs are equivalent to baseline. Direct reclaim is reduced but not eliminated. It's worth noting that there are two periods of direct reclaim for this workload. The first is when it switches from preparing the files for the actual test itself. It's a lot of file IO followed by a lot of allocs that reclaims heavily for a brief window. While direct reclaim is lower with clear-v2, it is due to kswapd scanning aggressively and trying to reclaim the world which is not the right thing to do. With the patches applied, there is still direct reclaim but the phase change from "creating work files" to starting multiple threads that allocate a lot of anonymous memory faster than kswapd can reclaim. Scanning/reclaim efficiency is restored by this patch. Page writes from reclaim context are back at 0 which is ideal. Pages immediately reclaimed after IO completes is slightly improved but it is expected this will vary slightly. On UMA, there is almost no change so this is not expected to be a universal win. 
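For reference, a minimal user-space sketch of the sentinel convention adopted here (the zone enum and the helper are illustrative approximations of the new kswapd_classzone_idx() handling, not the kernel code itself):

  #include <stdio.h>

  enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

  /*
   * MAX_NR_ZONES is not a valid zone index, so it can mean "no wakeup
   * request pending".  With nothing pending, fall back to the classzone the
   * caller is interested in; otherwise keep the highest classzone requested
   * so far, so a pending high-zone request is not forgotten.
   */
  static enum zone_type classzone_for_kswapd(enum zone_type pending,
                                             enum zone_type classzone_idx)
  {
          if (pending == MAX_NR_ZONES)
                  return classzone_idx;
          return pending > classzone_idx ? pending : classzone_idx;
  }

  int main(void)
  {
          /* idle kswapd deciding whether to sleep: consider the highest zone */
          printf("%d\n", classzone_for_kswapd(MAX_NR_ZONES, MAX_NR_ZONES - 1));
          /* a pending ZONE_MOVABLE request survives a later GFP_DMA wakeup */
          printf("%d\n", classzone_for_kswapd(ZONE_MOVABLE, ZONE_DMA));
          return 0;
  }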
Link: http://lkml.kernel.org/r/20170309075657.25121-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Shantanu Goel Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- mm/vmscan.c | 120 +++++++++++++++++++++++++++++----------------------- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208bcd56..587eccfd4588 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1211,7 +1211,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* Reset the nr_zones, order and classzone_idx before reuse */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; } /* we can use NODE_DATA(nid) from here */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 8c553fa0d800..8ce39867140b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3049,14 +3049,36 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; - return true; + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } + + /* + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. + */ + if (mark == -1) + return true; + + return false; } /* Clear pgdat state for congested, dirty or under writeback. */ @@ -3075,8 +3097,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential @@ -3097,16 +3117,9 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, order, classzone_idx)) { - clear_pgdat_congested(pgdat); - return true; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } return false; @@ -3212,23 +3225,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. 
*/ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3295,6 +3297,22 @@ out: return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woke for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3336,7 +3354,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3390,7 +3408,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3420,20 +3439,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3459,9 +3481,6 @@ kswapd_try_sleep: reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3477,7 +3496,6 @@ kswapd_try_sleep: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3485,7 +3503,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = 
max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; @@ -3494,17 +3513,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } -- cgit v1.2.3 From bfd894cbfd10d0b00e1464c8b459da9de2315238 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 5 Apr 2017 09:20:49 +1000 Subject: mm: page_alloc: __GFP_NOWARN shouldn't suppress stall warnings __GFP_NOWARN, which is usually added to avoid warnings from callsites that expect to fail and have fallbacks, currently also suppresses allocation stall warnings. These trigger when an allocation is stuck inside the allocator for 10 seconds or longer. But there is no class of allocations that can get legitimately stuck in the allocator for this long. This always indicates a problem. Always emit stall warnings. Restrict __GFP_NOWARN to alloc failures. Link: http://lkml.kernel.org/r/20170125181150.GA16398@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d0332b5f3d8..975b081b5c6c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3818,7 +3818,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; -- cgit v1.2.3 From 4a97827a90ad87b8acc7a0e23c8bb30215d4a217 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Apr 2017 09:20:49 +1000 Subject: mm/sparse: refine usemap_size() a little Current implementation calculates usemap_size in two steps: * calculate number of bytes to cover these bits * calculate number of "unsigned long" to cover these bytes It would be more clear by: * calculate number of "unsigned long" to cover these bits * multiple it with sizeof(unsigned long) This patch refine usemap_size() a little to make it more easy to understand. 
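Both orderings compute the same size; a quick user-space check (the macros and the example bit count below are stand-ins for the kernel's BITS_TO_LONGS, roundup() and SECTION_BLOCKFLAGS_BITS, chosen only for illustration):

  #include <stdio.h>

  #define EXAMPLE_BLOCKFLAGS_BITS 1100   /* stand-in for SECTION_BLOCKFLAGS_BITS */

  #define BITS_PER_LONG (8 * (unsigned long)sizeof(unsigned long))
  #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
  #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

  int main(void)
  {
          /* old: bits -> bytes, then round the byte count up to sizeof(unsigned long) */
          unsigned long old_size = ROUNDUP(ROUNDUP(EXAMPLE_BLOCKFLAGS_BITS, 8) / 8,
                                           sizeof(unsigned long));
          /* new: bits -> longs, then multiply by sizeof(unsigned long) */
          unsigned long new_size = BITS_TO_LONGS(EXAMPLE_BLOCKFLAGS_BITS) *
                                   sizeof(unsigned long);

          /* both print 144 on a 64-bit machine */
          printf("old=%lu new=%lu\n", old_size, new_size);
          return 0;
  }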
Link: http://lkml.kernel.org/r/20170310043713.96871-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/sparse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..6903c8fc3085 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From 24ac7307a6c911995f8311b2eb967e85a2442233 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:50 +1000 Subject: mm/compaction: ignore block suitable after check large free page By reviewing code, I find that if the migrate target is a large free page and we ignore suitable, it may splite large target free page into smaller block which is not good for defrag. So move the ignore block suitable after check large free page. As Vlastimil pointed out in RFC version that this patch is just based on logical analyses which might be better for future-proofing the function and it is most likely won't have any visible effect right now, for direct compaction shouldn't have to be called if there's a >=pageblock_order page already available. Link: http://lkml.kernel.org/r/1489490743-5364-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Joonsoo Kim Cc: David Rientjes Cc: Minchan Kim Cc: Hanjun Guo Cc: Xishi Qiu Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index bc7903130501..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1003,9 +1003,6 @@ static bool suitable_migration_source(struct compact_control *cc, static bool suitable_migration_target(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) - return true; - /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1017,6 +1014,9 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ if (is_migrate_movable(get_pageblock_migratetype(page))) return true; -- cgit v1.2.3 From c1b1c16b17cf9b2ad64215709330d53c8c4f5ef1 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:50 +1000 Subject: mm/vmscan: more restrictive condition for retry in do_try_to_free_pages By reviewing code, I find that when enter do_try_to_free_pages, the may_thrash is always clear, and it will retry shrink zones to tap cgroup's reserves memory by setting may_thrash when the former shrink_zones reclaim nothing. However, when memcg is disabled or on legacy hierarchy, or there do not have any memcg protected by low limit, it should not do this useless retry at all, for we do not have any cgroup's reserves memory to tap, and we have already done hard work but made no progress, which as Michal pointed out in former version, we are trying hard to control the retry logical of page alloctor, and the current additional round of reclaim is just lame. 
Therefore, to avoid this unneeded retrying and make code more readable, we remove the may_thrash field in scan_control, instead, introduce memcg_low_reclaim and memcg_low_skipped, and only retry when memcg_low_skipped, by setting memcg_low_reclaim. Link: http://lkml.kernel.org/r/1490191893-5923-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Michal Hocko Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/vmscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ce39867140b..068f21a1fa31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -100,6 +100,9 @@ struct scan_control { /* Can cgroups be reclaimed below their normal consumption range? */ unsigned int may_thrash:1; + /* Did we have any memcg protected by the low limit */ + unsigned int memcg_low_protection:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2512,6 +2515,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { + sc->memcg_low_protection = 1; + if (!sc->may_thrash) continue; mem_cgroup_events(memcg, MEMCG_LOW, 1); @@ -2768,7 +2773,7 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. */ - if (!sc->may_thrash) { + if (sc->memcg_low_protection && !sc->may_thrash) { sc->priority = initial_priority; sc->may_thrash = 1; goto retry; -- cgit v1.2.3 From 3597871d972d4d11d74cdf9d00c6b7fecd652cc3 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 5 Apr 2017 09:20:51 +1000 Subject: mm-vmscan-more-restrictive-condition-for-retry-in-do_try_to_free_pages-v5 - remove may_thrash field in scan_control, and introduce mem_cgroup_reclaim and memcg_low_skipped to make code more readable. - Johannes Link: http://lkml.kernel.org/r/1490191893-5923-1-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Michal Hocko Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/vmscan.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 068f21a1fa31..e54c882d6789 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -97,11 +97,13 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Can cgroups be reclaimed below their normal consumption range? */ - unsigned int may_thrash:1; - - /* Did we have any memcg protected by the low limit */ - unsigned int memcg_low_protection:1; + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; @@ -2515,10 +2517,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { - sc->memcg_low_protection = 1; - - if (!sc->may_thrash) + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; + } mem_cgroup_events(memcg, MEMCG_LOW, 1); } @@ -2773,9 +2775,10 @@ retry: return 1; /* Untapped cgroup reserves? Don't OOM, retry. 
*/ - if (sc->memcg_low_protection && !sc->may_thrash) { + if (sc->memcg_low_skipped) { sc->priority = initial_priority; - sc->may_thrash = 1; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; goto retry; } -- cgit v1.2.3 From c5f2c8719aa0935ecaeb842cd5317489e2d87129 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:52 +1000 Subject: mm: remove unncessary ret in page_referenced Nobody uses the ret variable. Remove it. Link: http://lkml.kernel.org/r/1489555493-14659-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Kirill A. Shutemov Cc: Johannes Weiner Cc: Michal Hocko Cc: Kirill A. Shutemov Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/rmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index a19bd8b8ab0d..4baf504e4213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) -- cgit v1.2.3 From 9bf5b6c237b198bd3d8b9fd37466566ae0d72941 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:52 +1000 Subject: mm: remove SWAP_DIRTY in ttu If we find a lazyfree page is dirty, try_to_unmap_one() can simply SetPageSwapBacked() it, as is done for PG_mlocked pages, and return SWAP_FAIL. That is natural because the page is not swappable right now, so vmscan can activate it. There is no point in introducing the new return value SWAP_DIRTY to try_to_unmap() at the moment. Link: http://lkml.kernel.org/r/1489555493-14659-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Kirill A. Shutemov Cc: Anshuman Khandual Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 1 - mm/rmap.c | 4 ++-- mm/vmscan.c | 3 --- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fee10d744ebd..b556eefa62bc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 4baf504e4213..f6aa18d8a420 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1436,7 +1436,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_DIRTY; + SetPageSwapBacked(page); + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1507,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked.
- * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e54c882d6789..f1fd388454bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1147,9 +1147,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_DIRTY: - SetPageSwapBacked(page); - /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; -- cgit v1.2.3 From 45d2347672d947da64f95f22f947a2a843917aaf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:53 +1000 Subject: mm: remove SWAP_MLOCK check for SWAP_SUCCESS in ttu If the page is mapped and rescue in try_to_unmap_one, page_mapcount(page) == 0 cannot be true so page_mapcount check in try_to_unmap is enough to return SWAP_SUCCESS. IOW, SWAP_MLOCK check is redundant so remove it. Link: http://lkml.kernel.org/r/1489555493-14659-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index f6aa18d8a420..dfe40557ea29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1535,7 +1535,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) + if (!page_mapcount(page)) ret = SWAP_SUCCESS; return ret; } -- cgit v1.2.3 From 291d76b59c81abf62a3f94ee019231f38c93e6c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:54 +1000 Subject: mm: make try_to_munlock() return void try_to_munlock returns SWAP_MLOCK if the one of VMAs mapped the page has VM_LOCKED flag. In that time, VM set PG_mlocked to the page if the page is not pte-mapped THP which cannot be mlocked, either. With that, __munlock_isolated_page can use PageMlocked to check whether try_to_munlock is successful or not without relying on try_to_munlock's retval. It helps to make try_to_unmap/try_to_unmap_one simple with upcoming patches. Link: http://lkml.kernel.org/r/1489555493-14659-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/mlock.c | 6 ++---- mm/rmap.c | 17 +++++------------ 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b556eefa62bc..1b0cd4cf68e3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. 
*/ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_unlock() succeed or punt? */ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index dfe40557ea29..d59112367d47 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1552,18 +1552,10 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. */ -int try_to_munlock(struct page *page) -{ - int ret; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, @@ -1573,9 +1565,10 @@ int try_to_munlock(struct page *page) }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageMlocked(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) -- cgit v1.2.3 From 3669edd331aecfb4967f468a0204692e1ab10584 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:55 +1000 Subject: mm: remove SWAP_MLOCK in ttu ttu doesn't need to return SWAP_MLOCK. Instead, just return SWAP_FAIL because it means the page is not-swappable so it should move to another LRU list(active or unevictable). putback friends will move it to right list depending on the page's LRU flag. Link: http://lkml.kernel.org/r/1489555493-14659-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 1 - mm/rmap.c | 3 +-- mm/vmscan.c | 20 +++++++------------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1b0cd4cf68e3..3630d4dcee13 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -297,6 +297,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_SUCCESS 0 #define SWAP_AGAIN 1 #define SWAP_FAIL 2 -#define SWAP_MLOCK 3 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index d59112367d47..4935d317e654 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1506,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. 
*/ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index f1fd388454bd..cfd2651966a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -987,7 +987,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; @@ -1152,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1295,20 +1293,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: -- cgit v1.2.3 From 2955ab2b800fcf08ae908aa62d8ae32fc933b600 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:56 +1000 Subject: mm: remove SWAP_AGAIN in ttu In 2002, [1] introduced SWAP_AGAIN. At that time, try_to_unmap_one used spin_trylock(&mm->page_table_lock) so it's really easy to contend and fail to hold a lock so SWAP_AGAIN to keep LRU status makes sense. However, now we changed it to mutex-based lock and be able to block without skip pte so there is few of small window to return SWAP_AGAIN so remove SWAP_AGAIN and just return SWAP_FAIL. [1] c48c43e, minimal rmap Link: http://lkml.kernel.org/r/1489555493-14659-7-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/rmap.c | 11 +++-------- mm/vmscan.c | 2 -- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4935d317e654..84ea7cc17c20 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1504,13 +1504,10 @@ static int page_mapcount_is_zero(struct page *page) * Return values are: * * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ int try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, @@ -1530,13 +1527,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (!page_mapcount(page)) - ret = SWAP_SUCCESS; - return ret; + return !page_mapcount(page) ? 
SWAP_SUCCESS : SWAP_FAIL; } static int page_not_mapped(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index cfd2651966a8..f80a54da5f7f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1150,8 +1150,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } -- cgit v1.2.3 From 304ba3e7009a075c0a5cd53bab4f93ece95f3c2d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:57 +1000 Subject: mm: make ttu's return boolean try_to_unmap() returns SWAP_SUCCESS or SWAP_FAIL so it's suitable for boolean return. This patch changes it. Link: http://lkml.kernel.org/r/1489555493-14659-8-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Naoya Horiguchi Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- mm/huge_memory.c | 6 +++--- mm/memory-failure.c | 26 ++++++++++++-------------- mm/rmap.c | 8 +++----- mm/vmscan.c | 7 +------ 5 files changed, 21 insertions(+), 30 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3630d4dcee13..6028c38d3cac 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,7 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -281,7 +281,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c36094c4fa4b..d14dd961f626 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2144,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f85adfe57484..3d3cf6add4c1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
*/ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; @@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; diff --git a/mm/rmap.c b/mm/rmap.c index 84ea7cc17c20..7eaca35c0f35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1501,12 +1501,10 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_FAIL - the page is unswappable + * If unmap is successful, return true. Otherwise, false. */ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, @@ -1531,7 +1529,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? SWAP_SUCCESS : SWAP_FAIL; + return !page_mapcount(page) ? 
true : false; } static int page_not_mapped(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a54da5f7f..7a30150b4dee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -972,7 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - int ret = SWAP_SUCCESS; cond_resched(); @@ -1145,13 +1144,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - switch (ret = try_to_unmap(page, - ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_FAIL: + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } -- cgit v1.2.3 From 63556f68b73a2a7b55e65552c6c7a80c36f7ffba Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:57 +1000 Subject: mm: make rmap_walk() return void There is no user of the return value from rmap_walk() and friends so this patch makes them void-returning functions. Link: http://lkml.kernel.org/r/1489555493-14659-9-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/ksm.h | 5 ++--- include/linux/rmap.h | 4 ++-- mm/ksm.c | 16 ++++++---------- mm/rmap.c | 32 +++++++++++++------------------- 4 files changed, 23 insertions(+), 34 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee58..78b44a024eaa 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6028c38d3cac..1d7d457ca0dc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -264,8 +264,8 @@ struct rmap_walk_control { bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 19b4f2dea7a5..6edffb9a795b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int 
rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (SWAP_AGAIN != rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/rmap.c b/mm/rmap.c index 7eaca35c0f35..7f22e8af80b5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1608,13 +1608,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1624,7 +1623,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1638,8 +1637,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1647,7 +1645,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1663,13 +1660,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1680,7 +1676,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1695,8 +1691,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1705,28 +1700,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From f6a77ae632c6fab01166f48afaa4d70a8a0620f5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:58 +1000 Subject: mm: make rmap_one boolean function rmap_one's return value controls whether rmap_work should contine to scan other ptes or not so it's target for changing to boolean. Return true if the scan should be continued. Otherwise, return false to stop the scanning. This patch makes rmap_one's return value to boolean. Link: http://lkml.kernel.org/r/1489555493-14659-10-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 +++++- mm/ksm.c | 2 +- mm/migrate.c | 4 ++-- mm/page_idle.c | 4 ++-- mm/rmap.c | 30 +++++++++++++++--------------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1d7d457ca0dc..13ed232cbb29 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -257,7 +257,11 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true. 
+ */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); diff --git a/mm/ksm.c b/mm/ksm.c index 6edffb9a795b..d9fc0e456128 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1977,7 +1977,7 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, + if (!rwc->rmap_one(page, vma, rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; diff --git a/mm/migrate.c b/mm/migrate.c index 937378e8b883..410b56bdabed 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 7f22e8af80b5..fabf9aac4ea0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -854,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -907,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1290,7 +1290,7 @@ void page_remove_rmap(struct page *page, bool compound) /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct 
page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1301,12 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; + bool ret = true; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1347,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1437,14 +1437,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ set_pte_at(mm, address, pvmw.pte, pteval); SetPageSwapBacked(page); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1637,7 +1637,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1691,7 +1691,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; -- cgit v1.2.3 From c6814dc1cbbbc7d11bbda8d302cf917ec2af0a2a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:59 +1000 Subject: mm: remove SWAP_[SUCCESS|AGAIN|FAIL] There is no user for it. Remove it. Link: http://lkml.kernel.org/r/1489555493-14659-11-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 13ed232cbb29..43ef2c30cb0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -295,11 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #endif /* _LINUX_RMAP_H */ -- cgit v1.2.3 From e190bdc81bbc9bf6877c57f043948a3a8dcaaa85 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:20:59 +1000 Subject: mm: use false instead of SWAP_FAIL There was mistake with git-rebase. SWAP_FAIL was removed. Use false. It could be folded into mm-make-ttus-return-boolean.patch. 
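As context for the rmap_one conversion in this series, here is a hedged sketch of what a caller looks like once rmap_one returns bool and rmap_walk() returns void; count_mappings_one() and count_mappings() are made-up names for illustration, only the rmap_walk_control fields follow the patches above.

#include <linux/rmap.h>
#include <linux/mm_types.h>

/* Return true to keep scanning, false to stop the walk early. */
static bool count_mappings_one(struct page *page, struct vm_area_struct *vma,
			       unsigned long address, void *arg)
{
	int *count = arg;

	(*count)++;
	return true;
}

static int count_mappings(struct page *page)
{
	int count = 0;
	struct rmap_walk_control rwc = {
		.rmap_one	= count_mappings_one,
		.arg		= &count,
	};

	rmap_walk(page, &rwc);		/* returns void after this series */
	return count;
}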
Link: http://lkml.kernel.org/r/20170316053313.GA19241@bbox Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index fabf9aac4ea0..589c6f80c090 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1419,7 +1419,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } -- cgit v1.2.3 From d3e1ae61c04fe987b2045bdb441efa2c776c374f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:00 +1000 Subject: mm, swap: Fix comment in __read_swap_cache_async cbab0e4eec29 ("swap: avoid read_swap_cache_async() race to deadlock while waiting on discard I/O completion") fixed a deadlock in read_swap_cache_async(). Because at that time, in swap allocation path, a swap entry may be set as SWAP_HAS_CACHE, then wait for discarding to complete before the page for the swap entry is added to the swap cache. But in 815c2c543d3a ("swap: make swap discard async"), the discarding for swap become asynchronous, waiting for discarding to complete will be done before the swap entry is set as SWAP_HAS_CACHE. So the comments in code is incorrect now. This patch fixes the comments. The cond_resched() added in the commit cbab0e4eec29 is not necessary now too. But if we added some sleep in swap allocation path in the future, there may be some hard to debug/reproduce deadlock bug. So it is kept. Link: http://lkml.kernel.org/r/20170317064635.12792-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rafael Aquini Cc: Shaohua Li Signed-off-by: Andrew Morton --- mm/swap_state.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8..7bfb9bd1ca21 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O completion at scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; -- cgit v1.2.3 From b988b7693347b8ea32bfc26b3d43e35451fe7429 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:01 +1000 Subject: mm, swap: improve readability via make spin_lock/unlock balanced This is just a cleanup patch, no functionality change. In cluster_list_add_tail(), spin_lock_nested() is used to lock the cluster, while unlock_cluster() is used to unlock the cluster. To improve the code readability. Use spin_unlock() directly to unlock the cluster. 
Link: http://lkml.kernel.org/r/20170317064635.12792-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6b6bb1bb6209..42fd620dcf4c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } -- cgit v1.2.3 From 1f1b7dc4acb469c145631c136453e01d00a8a91f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Apr 2017 09:21:02 +1000 Subject: mm, swap: avoid lock swap_avail_lock when held cluster lock Cluster lock is used to protect the swap_cluster_info and corresponding elements in swap_info_struct->swap_map[]. But it is found that now in scan_swap_map_slots(), swap_avail_lock may be acquired when cluster lock is held. This does no good except making the locking more complex and improving the potential locking contention, because the swap_info_struct->lock is used to protect the data structure operated in the code already. Fix this via moving the corresponding operations in scan_swap_map_slots() out of cluster lock. Link: http://lkml.kernel.org/r/20170317064635.12792-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 42fd620dcf4c..53b5881ee0d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -672,6 +672,9 @@ checks: else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +688,6 @@ checks: plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); -- cgit v1.2.3 From 7bab0c9f866c8dc26faab297ab59e6163455c5a4 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:03 +1000 Subject: mm: enable page poisoning early at boot On SPARSEMEM systems page poisoning is enabled after buddy is up, because of the dependency on page extension init. This causes the pages released by free_all_bootmem not to be poisoned. This either delays or misses the identification of some issues because the pages have to undergo another cycle of alloc-free-alloc for any corruption to be detected. Enable page poisoning early by getting rid of the PAGE_EXT_DEBUG_POISON flag. Since all the free pages will now be poisoned, the flag need not be verified before checking the poison during an alloc. 
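For readers unfamiliar with the mechanism being enabled earlier here, a hedged user-space sketch of the poison-on-free / verify-on-alloc idea that makes the per-page PAGE_EXT_DEBUG_POISON flag unnecessary; the 0xaa pattern and the helper names are stand-ins, and the kernel changes in the diff below are the authoritative version.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096
#define PAGE_POISON	0xaa	/* stand-in pattern for the kernel's poison byte */

/* Every page entering the free list gets the full poison pattern. */
static void poison_page(unsigned char *addr)
{
	memset(addr, PAGE_POISON, PAGE_SIZE);
}

/* On allocation, any byte that no longer matches means a write to a free page. */
static int poison_intact(const unsigned char *addr)
{
	for (size_t i = 0; i < PAGE_SIZE; i++)
		if (addr[i] != PAGE_POISON)
			return 0;
	return 1;
}

int main(void)
{
	static unsigned char page[PAGE_SIZE];

	poison_page(page);
	page[42] = 0;	/* simulated use-after-free scribble */
	printf("poison intact: %d\n", poison_intact(page));
	return 0;
}

Because every freed page is poisoned unconditionally, the allocation path can check the pattern without first consulting a per-page flag, which is why the flag and its page_ext plumbing can go.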
Link: http://lkml.kernel.org/r/1490358246-11001-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Laura Abbott Tested-by: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/page_alloc.c | 13 +++------ mm/page_ext.c | 3 --- mm/page_poison.c | 77 +++++++++--------------------------------------------- 4 files changed, 15 insertions(+), 79 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 00a8fa7e366a..8ae7460e44c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2497,7 +2497,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 975b081b5c6c..e025e620053d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1688,10 +1688,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1745,17 +1745,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..fc3e7ffd189e 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include #include -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf) early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) -{ - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) { /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
*/ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was posioned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } -- cgit v1.2.3 From 3fd18b363b5aee037c3617654200afe121235d62 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:03 +1000 Subject: mm-enable-page-poisoning-early-at-boot-v2 (1) Removed "select PAGE_EXTENSION" on CONFIG_PAGE_POISONING (2) Removed CONFIG_PAGE_POISONING checks in page_ext.c Link: http://lkml.kernel.org/r/1490878002-14423-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Cc: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton --- mm/Kconfig.debug | 1 - mm/page_ext.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_ext.c b/mm/page_ext.c index fc3e7ffd189e..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -124,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
- * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -201,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; -- cgit v1.2.3 From e942163483b72adce996d802c638c63c9002210d Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Wed, 5 Apr 2017 09:21:04 +1000 Subject: include/linux/migrate.h: add arg names to prototype It is preferred, and the rest of migrate.h gets it right. Link: http://lkml.kernel.org/r/1490336009-8024-1-git-send-email-pushkar.iit@gmail.com Signed-off-by: Pushkar Jambhlekar Signed-off-by: Andrew Morton --- include/linux/migrate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa76b516fa47..48e24844b3c5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +extern int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -- cgit v1.2.3 From 5ef9d7880f7c5332e779c24e100a920f494224eb Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 5 Apr 2017 09:21:05 +1000 Subject: mm/swap_slots.c: add warning if swap slots cache failed to initialize Add a warning diagnostics to user if we failed to allocate swap slots cache and use it. 
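The -fix patch that follows folds the warning into the error test, relying on WARN_ONCE() evaluating to (the truth value of) its condition; a hedged sketch of that idiom, where do_setup() is a hypothetical stand-in and the macro's real definition lives in include/asm-generic/bug.h:

/*
 * Illustrative only: WARN_ONCE(cond, fmt, ...) prints the warning the
 * first time cond is true, but evaluates to cond every time, so it can
 * drive the failure branch directly.
 */
static int enable_cache(void)
{
	int ret = do_setup();		/* hypothetical helper */

	if (WARN_ONCE(ret < 0, "setup failed (%d), running without the cache\n", ret))
		return ret;		/* taken on every failure, warned only once */

	return 0;
}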
Link: http://lkml.kernel.org/r/20170328234827.GA10107@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: Andrew Morton --- mm/swap_slots.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58ad397..ceb783630880 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,8 +241,11 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) + if (ret < 0) { + WARN_ONCE(1, "Cache allocation failed (%s), operate without swap slots cache.\n", + __func__); goto out_unlock; + } swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: -- cgit v1.2.3 From f2fc9d97b3123496f8cbab8097efe08645963348 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:05 +1000 Subject: swap-add-warning-if-swap-slots-cache-failed-to-initialize-fix use WARN_ONCE return value, fix grammar in message Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/swap_slots.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index ceb783630880..aa1c415f4abd 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -241,11 +241,10 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) { - WARN_ONCE(1, "Cache allocation failed (%s), operate without swap slots cache.\n", - __func__); + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) goto out_unlock; - } + swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: -- cgit v1.2.3 From 9ff290b4038492db1267ac3af9ca83d9e84b7cd2 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Wed, 5 Apr 2017 09:21:06 +1000 Subject: mm: fix spelling error Fix variable name error in comments. No code changes. Link: http://lkml.kernel.org/r/20170403161655.5081-1-haolee.swjtu@gmail.com Signed-off-by: Hao Lee Signed-off-by: Andrew Morton --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2bfcfd33e476..2b1a44f5bdb6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the - * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long - * and there are 16 of them to cover all possible combinations of + * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT + * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. -- cgit v1.2.3 From 437f8d09b04871ce64ae42461bbd01dbecbf71e6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 5 Apr 2017 09:21:07 +1000 Subject: userfaultfd: selftest: combine all cases into a single executable Currently, selftest for userfaultfd is compiled three times: for anonymous, shared and hugetlb memory. Let's combine all the cases into a single executable which will have a command line option for selection of the test type. 
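As a condensed sketch of the approach taken in the patch below, the memory-type specific helpers and the expected ioctl mask are grouped behind an ops table that is selected from the new command line argument (anon, hugetlb or shmem):

	/* condensed from the patch below; not the complete definitions */
	struct uffd_test_ops {
		unsigned long expected_ioctls;
		void (*allocate_area)(void **alloc_area);
		int (*release_pages)(char *rel_area);
	};

	static struct uffd_test_ops *uffd_test_ops;	/* chosen from argv[1] */

run_vmtests is updated accordingly and invokes the single binary as ./userfaultfd anon 128 32, ./userfaultfd hugetlb 128 32 $mnt/ufd_test_file or ./userfaultfd shmem 128 32.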
Link: http://lkml.kernel.org/r/1490869741-5913-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 11 +- tools/testing/selftests/vm/run_vmtests | 6 +- tools/testing/selftests/vm/userfaultfd.c | 207 +++++++++++++++++-------------- 3 files changed, 116 insertions(+), 108 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 41642ba5e318..dba889004ea1 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += userfaultfd_hugetlb -TEST_GEN_FILES += userfaultfd_shmem TEST_GEN_FILES += mlock-random-test TEST_PROGS := run_vmtests include ../lib.mk -$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h - -$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread - -$(OUTPUT)/userfaultfd_shmem: userfaultfd.c ../../../../usr/include/linux/kernel.h - $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread +$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h +$(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index c92f6cf31d0a..3214a6456d13 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -95,7 +95,7 @@ echo " hugetlb regression testing." echo "--------------------" echo "running userfaultfd" echo "--------------------" -./userfaultfd 128 32 +./userfaultfd anon 128 32 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -107,7 +107,7 @@ echo "----------------------------" echo "running userfaultfd_hugetlb" echo "----------------------------" # 258MB total huge pages == 128MB src and 128MB dst -./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file +./userfaultfd hugetlb 128 32 $mnt/ufd_test_file if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 @@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file echo "----------------------------" echo "running userfaultfd_shmem" echo "----------------------------" -./userfaultfd_shmem 128 32 +./userfaultfd shmem 128 32 if [ $? 
-ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index e9449c801888..1eae79ae5b4e 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; #define BOUNCE_POLL (1<<3) static int bounces; -#ifdef HUGETLB_TEST +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + static int huge_fd; static char *huge_fd_off0; -#endif static unsigned long long *count_verify; static int uffd, uffd_flags, finished, *pipefd; static char *area_src, *area_dst; @@ -102,14 +105,7 @@ pthread_attr_t attr; ~(unsigned long)(sizeof(unsigned long long) \ - 1))) -#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) - -/* Anonymous memory */ -#define EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ - (1 << _UFFDIO_COPY) | \ - (1 << _UFFDIO_ZEROPAGE)) - -static int release_pages(char *rel_area) +static int anon_release_pages(char *rel_area) { int ret = 0; @@ -121,7 +117,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area) { if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { fprintf(stderr, "out of memory\n"); @@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area) } } -#else /* HUGETLB_TEST or SHMEM_TEST */ - -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC - -#ifdef HUGETLB_TEST /* HugeTLB memory */ -static int release_pages(char *rel_area) +static int hugetlb_release_pages(char *rel_area) { int ret = 0; @@ -152,7 +143,7 @@ static int release_pages(char *rel_area) } -static void allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_HUGETLB, huge_fd, @@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area) huge_fd_off0 = *alloc_area; } -#elif defined(SHMEM_TEST) - /* Shared memory */ -static int release_pages(char *rel_area) +static int shmem_release_pages(char *rel_area) { int ret = 0; @@ -182,7 +171,7 @@ static int release_pages(char *rel_area) return ret; } -static void allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); @@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area) } } -#else /* SHMEM_TEST */ -#error "Undefined test type" -#endif /* HUGETLB_TEST */ - -#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */ +struct uffd_test_ops { + unsigned long expected_ioctls; + void (*allocate_area)(void **alloc_area); + int (*release_pages)(char *rel_area); +}; + +#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \ + (1 << _UFFDIO_COPY) | \ + (1 << _UFFDIO_ZEROPAGE)) + +static struct uffd_test_ops anon_uffd_test_ops = { + .expected_ioctls = ANON_EXPECTED_IOCTLS, + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, +}; + +static struct uffd_test_ops 
*uffd_test_ops; static int my_bcmp(char *str1, char *str2, size_t n) { @@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). */ - if (release_pages(area_src)) + if (uffd_test_ops->release_pages(area_src)) return 1; for (cpu = 0; cpu < nr_cpus; cpu++) { @@ -577,12 +590,12 @@ static int faulting_process(void) { unsigned long nr; unsigned long long count; + unsigned long split_nr_pages; -#ifndef HUGETLB_TEST - unsigned long split_nr_pages = (nr_pages + 1) / 2; -#else - unsigned long split_nr_pages = nr_pages; -#endif + if (test_type != TEST_HUGETLB) + split_nr_pages = (nr_pages + 1) / 2; + else + split_nr_pages = nr_pages; for (nr = 0; nr < split_nr_pages; nr++) { count = *area_count(area_dst, nr); @@ -594,7 +607,9 @@ static int faulting_process(void) } } -#ifndef HUGETLB_TEST + if (test_type == TEST_HUGETLB) + return 0; + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, area_src); if (area_dst == MAP_FAILED) @@ -610,7 +625,7 @@ static int faulting_process(void) } } - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; for (nr = 0; nr < nr_pages; nr++) { @@ -618,8 +633,6 @@ static int faulting_process(void) fprintf(stderr, "nr %lu is not zero\n", nr), exit(1); } -#endif /* HUGETLB_TEST */ - return 0; } @@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset) { struct uffdio_zeropage uffdio_zeropage; int ret; - unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE); + unsigned long has_zeropage; + + has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE); if (offset >= nr_pages * page_size) fprintf(stderr, "unexpected offset %lu\n", @@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void) printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; if (userfaultfd_open(0) < 0) @@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -716,7 +731,7 @@ static int userfaultfd_events_test(void) printf("testing events (fork, remap, remove): "); fflush(stdout); - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | @@ -731,7 +746,7 @@ static int userfaultfd_events_test(void) if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) fprintf(stderr, "register failure\n"), exit(1); - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) fprintf(stderr, @@ -773,10 +788,10 @@ static int userfaultfd_stress(void) int err; unsigned long userfaults[nr_cpus]; - allocate_area((void **)&area_src); + uffd_test_ops->allocate_area((void **)&area_src); if (!area_src) return 1; - allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_dst); if (!area_dst) return 1; @@ -856,7 +871,7 @@ static int userfaultfd_stress(void) fprintf(stderr, "register failure\n"); return 1; } - expected_ioctls = EXPECTED_IOCTLS; + expected_ioctls = uffd_test_ops->expected_ioctls; if ((uffdio_register.ioctls & 
expected_ioctls) != expected_ioctls) { fprintf(stderr, @@ -888,7 +903,7 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. */ - if (release_pages(area_dst)) + if (uffd_test_ops->release_pages(area_dst)) return 1; /* bounce pass */ @@ -934,36 +949,6 @@ static int userfaultfd_stress(void) return userfaultfd_zeropage_test() || userfaultfd_events_test(); } -#ifndef HUGETLB_TEST - -int main(int argc, char **argv) -{ - if (argc < 3) - fprintf(stderr, "Usage: \n"), exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = sysconf(_SC_PAGE_SIZE); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / - nr_cpus; - if (!nr_pages_per_cpu) { - fprintf(stderr, "invalid MiB\n"); - fprintf(stderr, "Usage: \n"), exit(1); - } - bounces = atoi(argv[2]); - if (bounces <= 0) { - fprintf(stderr, "invalid bounces\n"); - fprintf(stderr, "Usage: \n"), exit(1); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -} - -#else /* HUGETLB_TEST */ - /* * Copied from mlock2-tests.c */ @@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void) return hps; } -int main(int argc, char **argv) +static void set_test_type(const char *type) { - if (argc < 4) - fprintf(stderr, "Usage: \n"), - exit(1); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - page_size = default_huge_page_size(); + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "shmem")) { + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + } else { + fprintf(stderr, "Unknown test type: %s\n", type), exit(1); + } + + if (test_type == TEST_HUGETLB) + page_size = default_huge_page_size(); + else + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) - fprintf(stderr, "Unable to determine huge page size\n"), + fprintf(stderr, "Unable to determine page size\n"), exit(2); if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 > page_size) fprintf(stderr, "Impossible to run this test\n"), exit(2); - nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / +} + +int main(int argc, char **argv) +{ + if (argc < 4) + fprintf(stderr, "Usage: [hugetlbfs_file]\n"), + exit(1); + + set_test_type(argv[1]); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / nr_cpus; if (!nr_pages_per_cpu) { fprintf(stderr, "invalid MiB\n"); fprintf(stderr, "Usage: \n"), exit(1); } - bounces = atoi(argv[2]); + + bounces = atoi(argv[3]); if (bounces <= 0) { fprintf(stderr, "invalid bounces\n"); fprintf(stderr, "Usage: \n"), exit(1); } nr_pages = nr_pages_per_cpu * nr_cpus; - huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755); - if (huge_fd < 0) { - fprintf(stderr, "Open of %s failed", argv[3]); - perror("open"); - exit(1); - } - if (ftruncate(huge_fd, 0)) { - fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); - perror("ftruncate"); - exit(1); + + if (test_type == TEST_HUGETLB) { + if (argc < 5) + fprintf(stderr, "Usage: hugetlb \n"), + exit(1); + huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); + if (huge_fd < 0) { + fprintf(stderr, "Open of %s failed", argv[3]); + perror("open"); + 
exit(1); + } + if (ftruncate(huge_fd, 0)) { + fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + perror("ftruncate"); + exit(1); + } } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu); return userfaultfd_stress(); } -#endif #else /* __NR_userfaultfd */ #warning "missing __NR_userfaultfd definition" -- cgit v1.2.3 From d2da41daaebf5e81d5e102efb5cc079e938e8e47 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:07 +1000 Subject: zram: handle multiple pages attached to bio's bvec Johannes Thumshirn reported system goes the panic when using NVMe over Fabrics loopback target with zram. The reason is zram expects each bvec in bio contains a single page but nvme can attach a huge bulk of pages attached to the bio's bvec so that zram's index arithmetic could be wrong so that out-of-bound access makes panic. It was solved by limiting max_sectors with SECTORS_PER_PAGE in 0bc315381fe9 ("zram: set physical queue limits to avoid array out of bounds accesses") but that makes zram slow because a bio should split with each pages. So this patch makes zram aware of multiple pages in a bvec so it can solve the panic without causing any performance regression. Link: http://lkml.kernel.org/r/1491196653-7388-2-git-send-email-minchan@kernel.org Signed-off-by: Johannes Thumshirn Signed-off-by: Minchan Kim Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Cc: Jens Axboe Cc: Hannes Reinecke Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 01944419b1f3..28c2836f8c96 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -137,8 +137,7 @@ static inline bool valid_io_request(struct zram *zram, static void update_position(u32 *index, int *offset, struct bio_vec *bvec) { - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; + *index += (*offset + bvec->bv_len) / PAGE_SIZE; *offset = (*offset + bvec->bv_len) % PAGE_SIZE; } @@ -838,34 +837,20 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) } bio_for_each_segment(bvec, bio, iter) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec.bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. 
- */ - struct bio_vec bv; - - bv.bv_page = bvec.bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec.bv_offset; + struct bio_vec bv = bvec; + unsigned int remained = bvec.bv_len; + do { + bv.bv_len = min_t(unsigned int, PAGE_SIZE, remained); if (zram_bvec_rw(zram, &bv, index, offset, - op_is_write(bio_op(bio))) < 0) + op_is_write(bio_op(bio))) < 0) goto out; - bv.bv_len = bvec.bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, - op_is_write(bio_op(bio))) < 0) - goto out; - } else - if (zram_bvec_rw(zram, &bvec, index, offset, - op_is_write(bio_op(bio))) < 0) - goto out; + bv.bv_offset += bv.bv_len; + remained -= bv.bv_len; - update_position(&index, &offset, &bvec); + update_position(&index, &offset, &bv); + } while (remained); } bio_endio(bio); @@ -882,8 +867,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - blk_queue_split(queue, &bio, queue->bio_split); - if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); @@ -1191,8 +1174,6 @@ static int zram_add(void) blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; - zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE; - zram->disk->queue->limits.chunk_sectors = 0; blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); /* * zram_bio_discard() will clear all logical blocks if logical block -- cgit v1.2.3 From c610e8d62da66b47e41338c96260766a8cb0095e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:08 +1000 Subject: zram: partial IO refactoring For architecture(PAGE_SIZE > 4K), zram have supported partial IO. However, the mixed code for handling normal/partial IO is too mess, error-prone to modify IO handler functions with upcoming feature so this patch aims for cleaning up zram's IO handling functions. 
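To make the flow easier to follow before the large diff, here is an illustrative, condensed sketch of the refactored partial-write path, with names taken from the patch below: a partial bvec becomes a read-modify-write on a temporary full page, so only full pages reach the compression path.

	static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
				   u32 index, int offset)
	{
		struct bio_vec vec = *bvec;
		struct page *page = NULL;
		int ret;

		if (is_partial_io(bvec)) {
			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
			if (!page)
				return -ENOMEM;
			/* read: decompress the existing page contents */
			ret = zram_decompress_page(zram, page, index);
			if (ret)
				goto out;
			/* modify: copy the partial data over it (kmap + memcpy) */
			vec.bv_page = page;
			vec.bv_len = PAGE_SIZE;
			vec.bv_offset = 0;
		}
		/* write: compress and store the now-full page */
		ret = __zram_bvec_write(zram, &vec, index, offset);
	out:
		if (page)
			__free_page(page);
		return ret;
	}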
Link: http://lkml.kernel.org/r/1491196653-7388-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 333 +++++++++++++++++++++++------------------- 1 file changed, 184 insertions(+), 149 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 28c2836f8c96..7938f4b98b01 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,6 +45,8 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static void zram_free_page(struct zram *zram, size_t index); + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -98,10 +100,17 @@ static void zram_set_obj_size(struct zram_meta *meta, meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } +#if PAGE_SIZE != 4096 static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } +#else +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return false; +} +#endif static void zram_revalidate_disk(struct zram *zram) { @@ -191,18 +200,6 @@ static bool page_same_filled(void *ptr, unsigned long *element) return true; } -static void handle_same_page(struct bio_vec *bvec, unsigned long element) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -418,6 +415,53 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); +static bool zram_special_page_read(struct zram *zram, u32 index, + struct page *page, + unsigned int offset, unsigned int len) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_SAME)) { + void *mem; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + mem = kmap_atomic(page); + zram_fill_page(mem + offset, len, meta->table[index].element); + kunmap_atomic(mem); + return true; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + return false; +} + +static bool zram_special_page_write(struct zram *zram, u32 index, + struct page *page) +{ + unsigned long element; + void *mem = kmap_atomic(page); + + if (page_same_filled(mem, &element)) { + struct zram_meta *meta = zram->meta; + + kunmap_atomic(mem); + /* Free memory associated with this sector now. 
*/ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_SAME); + zram_set_element(meta, index, element); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.same_pages); + return true; + } + kunmap_atomic(mem); + + return false; +} + static void zram_meta_free(struct zram_meta *meta, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -504,169 +548,104 @@ static void zram_free_page(struct zram *zram, size_t index) zram_set_obj_size(meta, index, 0); } -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) { - int ret = 0; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; + int ret; unsigned long handle; unsigned int size; + void *src, *dst; + struct zram_meta *meta = zram->meta; + + if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) + return 0; bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); - return 0; - } - - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { - copy_page(mem, cmem); + dst = kmap_atomic(page); + copy_page(dst, src); + kunmap_atomic(dst); + ret = 0; } else { struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_decompress(zstrm, cmem, size, mem); + dst = kmap_atomic(page); + ret = zcomp_decompress(zstrm, src, size, dst); + kunmap_atomic(dst); zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret)) { + if (unlikely(ret)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - return ret; - } - return 0; + return ret; } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset) + u32 index, int offset) { int ret; struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_same_page(bvec, meta->table[index].element); + page = bvec->bv_page; + if (zram_special_page_read(zram, index, page, bvec->bv_offset, + bvec->bv_len)) return 0; - } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; - if (!uncmem) { - pr_err("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; + if (is_partial_io(bvec)) { + /* Use a temporary buffer to decompress the page */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; } - ret = zram_decompress_page(zram, uncmem, index); - /* Should NEVER happen. Return bio error if it does. 
*/ + ret = zram_decompress_page(zram, page, index); if (unlikely(ret)) - goto out_cleanup; + goto out; - if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); + if (is_partial_io(bvec)) { + void *dst = kmap_atomic(bvec->bv_page); + void *src = kmap_atomic(page); - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); + memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + kunmap_atomic(src); + kunmap_atomic(dst); + } +out: if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); + return ret; } -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) +static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, + struct page *page, + unsigned long *out_handle, unsigned int *out_comp_len) { - int ret = 0; - unsigned int clen; + int ret; + unsigned int comp_len; + void *src; unsigned long handle = 0; - struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm = NULL; - unsigned long alloced_pages; - unsigned long element; - - page = bvec->bv_page; - if (is_partial_io(bvec)) { - /* - * This is a partial IO. We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; - } compress_again: - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_same_filled(uncmem, &element)) { - if (user_mem) - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); - zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - - atomic64_inc(&zram->stats.same_pages); - ret = 0; - goto out; - } - - zstrm = zcomp_stream_get(zram->comp); - ret = zcomp_compress(zstrm, uncmem, &clen); - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } + src = kmap_atomic(page); + ret = zcomp_compress(*zstrm, src, &comp_len); + kunmap_atomic(src); if (unlikely(ret)) { pr_err("Compression failed! err=%d\n", ret); - goto out; + return ret; } - src = zstrm->buffer; - if (unlikely(clen > max_zpage_size)) { - clen = PAGE_SIZE; - if (is_partial_io(bvec)) - src = uncmem; - } + if (unlikely(comp_len > max_zpage_size)) + comp_len = PAGE_SIZE; /* * handle allocation has 2 paths: @@ -682,50 +661,70 @@ compress_again: * from the slow path and handle has already been allocated. 
*/ if (!handle) - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); if (!handle) { zcomp_stream_put(zram->comp); - zstrm = NULL; - atomic64_inc(&zram->stats.writestall); - - handle = zs_malloc(meta->mem_pool, clen, + handle = zs_malloc(meta->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + *zstrm = zcomp_stream_get(zram->comp); if (handle) goto compress_again; + return -ENOMEM; + } - pr_err("Error allocating memory for compressed page: %u, size=%u\n", - index, clen); - ret = -ENOMEM; - goto out; + *out_handle = handle; + *out_comp_len = comp_len; + return 0; +} + +static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + unsigned long handle; + unsigned int comp_len; + void *src, *dst; + struct zcomp_strm *zstrm; + unsigned long alloced_pages; + struct zram_meta *meta = zram->meta; + struct page *page = bvec->bv_page; + + if (zram_special_page_write(zram, index, page)) + return 0; + + zstrm = zcomp_stream_get(zram->comp); + ret = zram_compress(zram, &zstrm, page, &handle, &comp_len); + if (ret) { + zcomp_stream_put(zram->comp); + return ret; } alloced_pages = zs_get_total_pages(meta->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zcomp_stream_put(zram->comp); zs_free(meta->mem_pool, handle); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + if (comp_len == PAGE_SIZE) { src = kmap_atomic(page); - copy_page(cmem, src); + copy_page(dst, src); kunmap_atomic(src); } else { - memcpy(cmem, src, clen); + memcpy(dst, zstrm->buffer, comp_len); } zcomp_stream_put(zram->comp); - zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); /* @@ -734,19 +733,54 @@ compress_again: */ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, clen); + zram_set_obj_size(meta, index, comp_len); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_add(comp_len, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); + return 0; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page = NULL; + void *src; + struct bio_vec vec; + + vec = *bvec; + if (is_partial_io(bvec)) { + void *dst; + /* + * This is a partial IO. We need to read the full page + * before to write the changes. 
+ */ + page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); + if (!page) + return -ENOMEM; + + ret = zram_decompress_page(zram, page, index); + if (ret) + goto out; + + src = kmap_atomic(bvec->bv_page); + dst = kmap_atomic(page); + memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + + vec.bv_page = page; + vec.bv_len = PAGE_SIZE; + vec.bv_offset = 0; + } + + ret = __zram_bvec_write(zram, &vec, index, offset); out: - if (zstrm) - zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) - kfree(uncmem); + __free_page(page); return ret; } @@ -802,6 +836,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (!is_write) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); + flush_dcache_page(bvec->bv_page); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); -- cgit v1.2.3 From 81dcaf8152aa471dfe836624af7e2628463e7c23 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:09 +1000 Subject: zram: use zram_slot_lock instead of raw bit_spin_lock op With this cleanup phase, I want to use zram's wrapper function to lock table access which is more consistent with other zram's functions. Link: http://lkml.kernel.org/r/1491196653-7388-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7938f4b98b01..71b0a584bc85 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -415,24 +415,39 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + struct zram_meta *meta = zram->meta; + + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); +} + static bool zram_special_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { struct zram_meta *meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_SAME)) { void *mem; - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); mem = kmap_atomic(page); zram_fill_page(mem + offset, len, meta->table[index].element); kunmap_atomic(mem); return true; } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); return false; } @@ -448,11 +463,11 @@ static bool zram_special_page_write(struct zram *zram, u32 index, kunmap_atomic(mem); /* Free memory associated with this sector now. 
*/ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_SAME); zram_set_element(meta, index, element); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); return true; @@ -559,7 +574,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); @@ -578,7 +593,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zram->comp); } zs_unmap_object(meta->mem_pool, handle); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) @@ -731,11 +746,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, * Free memory associated with this sector * before overwriting unused sectors. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); meta->table[index].handle = handle; zram_set_obj_size(meta, index, comp_len); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); /* Update stats */ atomic64_add(comp_len, &zram->stats.compr_data_size); @@ -793,7 +808,6 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; - struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -814,9 +828,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; @@ -925,9 +939,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_lock(zram, index); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.notify_free); } -- cgit v1.2.3 From cbdd6aae9e56a1683f18de141d9553cb1d1f7c52 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:09 +1000 Subject: zram: remove zram_meta structure It's redundant now. Instead, remove it and use zram structure directly. 
Link: http://lkml.kernel.org/r/1491196653-7388-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 163 +++++++++++++++++------------------------- drivers/block/zram/zram_drv.h | 6 +- 2 files changed, 65 insertions(+), 104 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 71b0a584bc85..fdb73222841d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -58,46 +58,46 @@ static inline struct zram *dev_to_zram(struct device *dev) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram_meta *meta, u32 index, +static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - return meta->table[index].value & BIT(flag); + return zram->table[index].value & BIT(flag); } -static void zram_set_flag(struct zram_meta *meta, u32 index, +static void zram_set_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value |= BIT(flag); + zram->table[index].value |= BIT(flag); } -static void zram_clear_flag(struct zram_meta *meta, u32 index, +static void zram_clear_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { - meta->table[index].value &= ~BIT(flag); + zram->table[index].value &= ~BIT(flag); } -static inline void zram_set_element(struct zram_meta *meta, u32 index, +static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { - meta->table[index].element = element; + zram->table[index].element = element; } -static inline void zram_clear_element(struct zram_meta *meta, u32 index) +static inline void zram_clear_element(struct zram *zram, u32 index) { - meta->table[index].element = 0; + zram->table[index].element = 0; } -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +static size_t zram_get_obj_size(struct zram *zram, u32 index) { - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static void zram_set_obj_size(struct zram_meta *meta, +static void zram_set_obj_size(struct zram *zram, u32 index, size_t size) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; + zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } #if PAGE_SIZE != 4096 @@ -252,9 +252,8 @@ static ssize_t mem_used_max_store(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - struct zram_meta *meta = zram->meta; atomic_long_set(&zram->stats.max_used_pages, - zs_get_total_pages(meta->mem_pool)); + zs_get_total_pages(zram->mem_pool)); } up_read(&zram->init_lock); @@ -327,7 +326,6 @@ static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -335,8 +333,7 @@ static ssize_t compact_store(struct device *dev, return -EINVAL; } - meta = zram->meta; - zs_compact(meta->mem_pool); + zs_compact(zram->mem_pool); up_read(&zram->init_lock); return len; @@ -373,8 +370,8 @@ static ssize_t mm_stat_show(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) { - mem_used = zs_get_total_pages(zram->meta->mem_pool); - 
zs_pool_stats(zram->meta->mem_pool, &pool_stats); + mem_used = zs_get_total_pages(zram->mem_pool); + zs_pool_stats(zram->mem_pool, &pool_stats); } orig_size = atomic64_read(&zram->stats.pages_stored); @@ -418,32 +415,26 @@ static DEVICE_ATTR_RO(debug_stat); static void zram_slot_lock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); } static void zram_slot_unlock(struct zram *zram, u32 index) { - struct zram_meta *meta = zram->meta; - - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } static bool zram_special_page_read(struct zram *zram, u32 index, struct page *page, unsigned int offset, unsigned int len) { - struct zram_meta *meta = zram->meta; - zram_slot_lock(zram, index); - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_SAME)) { + if (unlikely(!zram->table[index].handle) || + zram_test_flag(zram, index, ZRAM_SAME)) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, meta->table[index].element); + zram_fill_page(mem + offset, len, zram->table[index].element); kunmap_atomic(mem); return true; } @@ -459,14 +450,12 @@ static bool zram_special_page_write(struct zram *zram, u32 index, void *mem = kmap_atomic(page); if (page_same_filled(mem, &element)) { - struct zram_meta *meta = zram->meta; - kunmap_atomic(mem); /* Free memory associated with this sector now. */ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_SAME); - zram_set_element(meta, index, element); + zram_set_flag(zram, index, ZRAM_SAME); + zram_set_element(zram, index, element); zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); @@ -477,56 +466,44 @@ static bool zram_special_page_write(struct zram *zram, u32 index, return false; } -static void zram_meta_free(struct zram_meta *meta, u64 disksize) +static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; size_t index; /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. 
*/ - if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) continue; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); } - zs_destroy_pool(meta->mem_pool); - vfree(meta->table); - kfree(meta); + zs_destroy_pool(zram->mem_pool); + vfree(zram->table); } -static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +static bool zram_meta_alloc(struct zram *zram, u64 disksize) { size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - - if (!meta) - return NULL; num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto out_error; - } + zram->table = vzalloc(num_pages * sizeof(*zram->table)); + if (!zram->table) + return false; - meta->mem_pool = zs_create_pool(pool_name); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto out_error; + zram->mem_pool = zs_create_pool(zram->disk->disk_name); + if (!zram->mem_pool) { + vfree(zram->table); + return false; } - return meta; - -out_error: - vfree(meta->table); - kfree(meta); - return NULL; + return true; } /* @@ -536,16 +513,15 @@ out_error: */ static void zram_free_page(struct zram *zram, size_t index) { - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle = zram->table[index].handle; /* * No memory is allocated for same element filled pages. * Simply clear same page flag. */ - if (zram_test_flag(meta, index, ZRAM_SAME)) { - zram_clear_flag(meta, index, ZRAM_SAME); - zram_clear_element(meta, index); + if (zram_test_flag(zram, index, ZRAM_SAME)) { + zram_clear_flag(zram, index, ZRAM_SAME); + zram_clear_element(zram, index); atomic64_dec(&zram->stats.same_pages); return; } @@ -553,14 +529,14 @@ static void zram_free_page(struct zram *zram, size_t index) if (!handle) return; - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); - atomic64_sub(zram_get_obj_size(meta, index), + atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - meta->table[index].handle = 0; - zram_set_obj_size(meta, index, 0); + zram->table[index].handle = 0; + zram_set_obj_size(zram, index, 0); } static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) @@ -569,16 +545,15 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) unsigned long handle; unsigned int size; void *src, *dst; - struct zram_meta *meta = zram->meta; if (zram_special_page_read(zram, index, page, 0, PAGE_SIZE)) return 0; zram_slot_lock(zram, index); - handle = meta->table[index].handle; - size = zram_get_obj_size(meta, index); + handle = zram->table[index].handle; + size = zram_get_obj_size(zram, index); - src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_atomic(page); copy_page(dst, src); @@ -592,7 +567,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) kunmap_atomic(dst); zcomp_stream_put(zram->comp); } - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); zram_slot_unlock(zram, index); /* Should NEVER happen. Return bio error if it does. 
*/ @@ -647,7 +622,6 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm, unsigned int comp_len; void *src; unsigned long handle = 0; - struct zram_meta *meta = zram->meta; compress_again: src = kmap_atomic(page); @@ -676,7 +650,7 @@ compress_again: * from the slow path and handle has already been allocated. */ if (!handle) - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | @@ -684,7 +658,7 @@ compress_again: if (!handle) { zcomp_stream_put(zram->comp); atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(meta->mem_pool, comp_len, + handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); *zstrm = zcomp_stream_get(zram->comp); @@ -707,7 +681,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, void *src, *dst; struct zcomp_strm *zstrm; unsigned long alloced_pages; - struct zram_meta *meta = zram->meta; struct page *page = bvec->bv_page; if (zram_special_page_write(zram, index, page)) @@ -720,16 +693,16 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, return ret; } - alloced_pages = zs_get_total_pages(meta->mem_pool); + alloced_pages = zs_get_total_pages(zram->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { zcomp_stream_put(zram->comp); - zs_free(meta->mem_pool, handle); + zs_free(zram->mem_pool, handle); return -ENOMEM; } - dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); if (comp_len == PAGE_SIZE) { src = kmap_atomic(page); @@ -740,7 +713,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } zcomp_stream_put(zram->comp); - zs_unmap_object(meta->mem_pool, handle); + zs_unmap_object(zram->mem_pool, handle); /* * Free memory associated with this sector @@ -748,8 +721,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, */ zram_slot_lock(zram, index); zram_free_page(zram, index); - meta->table[index].handle = handle; - zram_set_obj_size(meta, index, comp_len); + zram->table[index].handle = handle; + zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); /* Update stats */ @@ -934,10 +907,8 @@ static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_meta *meta; zram = bdev->bd_disk->private_data; - meta = zram->meta; zram_slot_lock(zram, index); zram_free_page(zram, index); @@ -985,7 +956,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zram_meta *meta; struct zcomp *comp; u64 disksize; @@ -998,7 +968,6 @@ static void zram_reset_device(struct zram *zram) return; } - meta = zram->meta; comp = zram->comp; disksize = zram->disksize; @@ -1011,7 +980,7 @@ static void zram_reset_device(struct zram *zram) up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); zcomp_destroy(comp); } @@ -1020,7 +989,6 @@ static ssize_t disksize_store(struct device *dev, { u64 disksize; struct zcomp *comp; - struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); int err; @@ -1029,8 +997,7 @@ static ssize_t disksize_store(struct device *dev, return -EINVAL; disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->disk_name, disksize); - if (!meta) + if (!zram_meta_alloc(zram, disksize)) return -ENOMEM; comp = zcomp_create(zram->compressor); 
@@ -1048,7 +1015,6 @@ static ssize_t disksize_store(struct device *dev, goto out_destroy_comp; } - zram->meta = meta; zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); @@ -1061,7 +1027,7 @@ out_destroy_comp: up_write(&zram->init_lock); zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta, disksize); + zram_meta_free(zram, disksize); return err; } @@ -1248,7 +1214,6 @@ static int zram_add(void) goto out_free_disk; } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); - zram->meta = NULL; pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index caeff51f1571..e34e44d02e3e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -92,13 +92,9 @@ struct zram_stats { atomic64_t writestall; /* no. of write slow paths */ }; -struct zram_meta { +struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; -}; - -struct zram { - struct zram_meta *meta; struct zcomp *comp; struct gendisk *disk; /* Prevent concurrent execution of device init */ -- cgit v1.2.3 From 6d398c02ffb90d4a581a83d9a0a41db9ae5ec2c8 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 5 Apr 2017 09:21:10 +1000 Subject: zram: introduce zram data accessor With element, sometime I got confused handle and element access. It might be my bad but I think it's time to introduce accessor to prevent future idiot like me. This patch is just clean-up patch so it shouldn't change any behavior. Link: http://lkml.kernel.org/r/1491196653-7388-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hannes Reinecke Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Mika Penttil Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fdb73222841d..c3171e5aa582 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -57,6 +57,16 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } +static unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + /* flag operations require table entry bit_spin_lock() being held */ static int zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) @@ -82,9 +92,9 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static inline void zram_clear_element(struct zram *zram, u32 index) +static unsigned long zram_get_element(struct zram *zram, u32 index) { - zram->table[index].element = 0; + return zram->table[index].element; } static size_t zram_get_obj_size(struct zram *zram, u32 index) @@ -428,13 +438,14 @@ static bool zram_special_page_read(struct zram *zram, u32 index, unsigned int offset, unsigned int len) { zram_slot_lock(zram, index); - if (unlikely(!zram->table[index].handle) || - zram_test_flag(zram, index, ZRAM_SAME)) { + if (unlikely(!zram_get_handle(zram, index) || + zram_test_flag(zram, index, ZRAM_SAME))) { void *mem; zram_slot_unlock(zram, index); mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, zram->table[index].element); + zram_fill_page(mem + offset, len, + 
zram_get_element(zram, index)); kunmap_atomic(mem); return true; } @@ -473,7 +484,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize) /* Free all pages that are still in this zram device */ for (index = 0; index < num_pages; index++) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. * Simply clear same page flag. @@ -513,7 +524,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) */ static void zram_free_page(struct zram *zram, size_t index) { - unsigned long handle = zram->table[index].handle; + unsigned long handle = zram_get_handle(zram, index); /* * No memory is allocated for same element filled pages. @@ -521,7 +532,7 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(zram, index, ZRAM_SAME)) { zram_clear_flag(zram, index, ZRAM_SAME); - zram_clear_element(zram, index); + zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); return; } @@ -535,7 +546,7 @@ static void zram_free_page(struct zram *zram, size_t index) &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); - zram->table[index].handle = 0; + zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); } @@ -550,7 +561,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index) return 0; zram_slot_lock(zram, index); - handle = zram->table[index].handle; + handle = zram_get_handle(zram, index); size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); @@ -721,7 +732,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, */ zram_slot_lock(zram, index); zram_free_page(zram, index); - zram->table[index].handle = handle; + zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); -- cgit v1.2.3 From f0f450254111c052e14bb721089ec893ced2d3e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:21:11 +1000 Subject: mm/slab.h: add additional consistency check As found in PaX, this adds a cheap check on heap consistency, just to notice if things have gotten corrupted in the page lookup. Given the kinds of heap attacks I've been seeing, I think this added consistency check is worth it given how inexpensive it is. When heap metadata gets corrupted, we can get into nasty side-effects that can be attacker-controlled, so better to catch obviously bad states as early as possible. 
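In short, the check (the one-line hunk follows below) verifies that the page backing a supposed slab object really is a slab page before page->slab_cache is trusted; roughly:

	/* illustrative summary of the added check in cache_from_obj() */
	page = virt_to_head_page(x);
	BUG_ON(!PageSlab(page));	/* corrupted or non-slab pointer */
	cachep = page->slab_cache;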
Link: http://lkml.kernel.org/r/20170331164028.GA118828@beast Signed-off-by: Kees Cook Cc: Michael Ellerman Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/slab.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..64447640b70c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -384,6 +384,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; page = virt_to_head_page(x); + BUG_ON(!PageSlab(page)); cachep = page->slab_cache; if (slab_equal_or_root(cachep, s)) return cachep; -- cgit v1.2.3 From bccc07c8529761ea6e0e8609f40c1b3bb8ddc2e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 5 Apr 2017 09:21:11 +1000 Subject: oom: improve oom disable handling Tetsuo has reported that sysrq triggered OOM killer will print a misleading information when no tasks are selected: [ 713.805315] sysrq: SysRq : Manual OOM execution [ 713.808920] Out of memory: Kill process 4468 ((agetty)) score 0 or sacrifice child [ 713.814913] Killed process 4468 ((agetty)) total-vm:43704kB, anon-rss:1760kB, file-rss:0kB, shmem-rss:0kB [ 714.004805] sysrq: SysRq : Manual OOM execution [ 714.005936] Out of memory: Kill process 4469 (systemd-cgroups) score 0 or sacrifice child [ 714.008117] Killed process 4469 (systemd-cgroups) total-vm:10704kB, anon-rss:120kB, file-rss:0kB, shmem-rss:0kB [ 714.189310] sysrq: SysRq : Manual OOM execution [ 714.193425] sysrq: OOM request ignored because killer is disabled [ 714.381313] sysrq: SysRq : Manual OOM execution [ 714.385158] sysrq: OOM request ignored because killer is disabled [ 714.573320] sysrq: SysRq : Manual OOM execution [ 714.576988] sysrq: OOM request ignored because killer is disabled The real reason is that there are no eligible tasks for the OOM killer to select but since 7c5f64f84483bd13 ("mm: oom: deduplicate victim selection code for memcg and global oom") the semantic of out_of_memory has changed without updating moom_callback. This patch updates moom_callback to tell that no task was eligible which is the case for both oom killer disabled and no eligible tasks. In order to help distinguish first case from the second add printk to both oom_killer_{enable,disable}. This information is useful on its own because it might help debugging potential memory allocation failures. Fixes: 7c5f64f84483bd13 ("mm: oom: deduplicate victim selection code for memcg and global oom") Link: http://lkml.kernel.org/r/20170404134705.6361-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton --- drivers/tty/sysrq.c | 2 +- mm/oom_kill.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index c6fc7141d7b2..32d61f1810f0 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored) mutex_lock(&oom_lock); if (!out_of_memory(&oc)) - pr_info("OOM request ignored because killer is disabled\n"); + pr_info("OOM request ignored. 
No task eligible\n"); mutex_unlock(&oom_lock); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d083714a2bb9..04c9143a8625 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -685,6 +685,7 @@ void exit_oom_victim(void) void oom_killer_enable(void) { oom_killer_disabled = false; + pr_info("OOM killer enabled.\n"); } /** @@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout) oom_killer_enable(); return false; } + pr_info("OOM killer disabled.\n"); return true; } -- cgit v1.2.3 From 5560b2a6eac9664304bbbc280b007e019c491470 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 5 Apr 2017 09:21:12 +1000 Subject: mm/mmap: replace SHM_HUGE_MASK with MAP_HUGE_MASK inside mmap_pgoff 091d0d55b286 ("shm: fix null pointer deref when userspace specifies invalid hugepage size") had replaced MAP_HUGE_MASK with SHM_HUGE_MASK. Though both of them contain the same numeric value of 0x3f, MAP_HUGE_MASK flag sounds more appropriate than the other one in the context. Hence change it back. Link: http://lkml.kernel.org/r/20170404045635.616-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Reviewed-by: Matthew Wilcox Acked-by: Balbir Singh Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index bfbe8856d134..f82741e199c0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, struct user_struct *user = NULL; struct hstate *hs; - hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; -- cgit v1.2.3 From 4bc7055c7f0f9aee23aeed9001ff5530416fa069 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Apr 2017 09:21:12 +1000 Subject: mm/page_alloc: return 0 in case this node has no page within the zone The whole memory space is divided into several zones and nodes may have no page in some zones. In this case, the __absent_pages_in_range() would return 0, since the range it is searching for is an empty range. Also this happens more often to those nodes with higher memory range when there are more nodes, which is a trend for future architectures. This patch checks the zone range after clamp and adjustment, return 0 if the range is an empty range. Link: http://lkml.kernel.org/r/20170206154314.15705-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e025e620053d..4ec48c83c460 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5783,6 +5783,11 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); + + /* If this node has no page within this zone, return 0. */ + if (zone_start_pfn == zone_end_pfn) + return 0; + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); /* -- cgit v1.2.3 From d53fffc22b530fc4ed15e6f031ced4b54ee8f46c Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 5 Apr 2017 09:21:13 +1000 Subject: mm: vmscan: do not pass reclaimed slab to vmpressure During global reclaim, the nr_reclaimed passed to vmpressure includes the pages reclaimed from slab. But the corresponding scanned slab pages is not passed. There is an impact to the vmpressure values because of this. 
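To make the skew concrete: vmpressure derives its level from the scanned:reclaimed ratio, roughly along the lines of (a simplification of mm/vmpressure.c, not the exact code):

    pressure = 100 - (100 * reclaimed / scanned);

Counting slab pages in "reclaimed" while leaving "scanned" untouched therefore lowers the computed pressure, which is why consumers see fewer and later critical events, as described below.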
While moving from kernel version 3.18 to 4.4, a difference is seen in the vmpressure values for the same workload resulting in a different behaviour of the vmpressure consumer. One such case is of a vmpressure based lowmemorykiller. It is observed that the vmpressure events are received late and less in number resulting in tasks not being killed at the right time. In this use case, The number of critical vmpressure events received is around 50% less on 4.4 than 3.18. The following numbers show the impact on reclaim activity due to the change in behaviour of lowmemorykiller on a 4GB device. The test launches a number of apps in sequence and repeats it multiple times. The difference in reclaim behaviour is because of lesser number of kills and kills happening late, resulting in more swapping and page cache reclaim. v4.4 v3.18 pgpgin 163016456 145617236 pgpgout 4366220 4188004 workingset_refault 29857868 26781854 workingset_activate 6293946 5634625 pswpin 1327601 1133912 pswpout 3593842 3229602 pgalloc_dma 99520618 94402970 pgalloc_normal 104046854 98124798 pgfree 203772640 192600737 pgmajfault 2126962 1851836 pgsteal_kswapd_dma 19732899 18039462 pgsteal_kswapd_normal 19945336 17977706 pgsteal_direct_dma 206757 131376 pgsteal_direct_normal 236783 138247 pageoutrun 116622 108370 allocstall 7220 4684 compact_stall 931 856 The lowmemorykiller example above is just for indicating the difference in vmpressure events between 4.4 and 3.18. Do not consider reclaimed slab pages for vmpressure calculation. The reclaimed pages from slab can be excluded because the freeing of a page by slab shrinking depends on each slab's object population, making the cost model (i.e. scan:free) different from that of LRU. Also, not every shrinker accounts the pages it reclaims. Ideally the pages reclaimed from slab should be passed to vmpressure, otherwise higher vmpressure levels can be triggered even when there is a reclaim progress. But accounting only the reclaimed slab pages without the scanned, and adding something which does not fit into the cost model just adds noise to the vmpressure values. Fixes: 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") Link: http://lkml.kernel.org/r/1486641577-11685-2-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Rik van Riel Cc: Vladimir Davydov Cc: Anton Vorontsov Cc: Shiraz Hashim Signed-off-by: Andrew Morton --- mm/vmscan.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a30150b4dee..58615bb27f2f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2550,16 +2550,23 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - nr_scanned, node_lru_pages); + /* + * Record the subtree's reclaim efficiency. The reclaimed + * pages from slab is excluded here because the corresponding + * scanned pages is not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. 
+ */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; -- cgit v1.2.3 From 013ae23fee7926cffae1e1b840f0466baed4191b Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 5 Apr 2017 09:21:13 +1000 Subject: mm/page_owner: align with pageblock_nr pages When pfn_valid(pfn) returns false, pfn should be aligned with pageblock_nr_pages other than MAX_ORDER_NR_PAGES in init_pages_in_zone, because the skipped 2M may be valid pfn, as a result, early allocated count will not be accurate. Link: http://lkml.kernel.org/r/1468938136-24228-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..754efdd52bf7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -527,7 +527,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From 0fba0909df759f27efbed7f234beff93139ba60e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 5 Apr 2017 09:21:14 +1000 Subject: mm/vmstat.c: walk the zone in pageblock_nr_pages steps when walking the zone, we can happens to the holes. we should not align MAX_ORDER_NR_PAGES, so it can skip the normal memory. In addition, pagetypeinfo_showmixedcount_print reflect fragmentization. we hope to get more accurate data. therefore, I decide to fix it. Link: http://lkml.kernel.org/r/1469502526-24486-2-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 754efdd52bf7..c3cee247f2e6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -261,7 +261,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, */ for (; pfn < end_pfn; ) { if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + pfn = ALIGN(pfn + 1, pageblock_nr_pages); continue; } -- cgit v1.2.3 From b4b49d521d1ee53b4988ea36eaa292a0787e330e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:14 +1000 Subject: kasan: introduce helper functions for determining bug type Patch series "kasan: improve error reports", v2. This patchset improves KASAN reports by making them easier to read and a little more detailed. Also improves mm/kasan/report.c readability. 
Effectively changes a use-after-free report to: ================================================================== BUG: KASAN: use-after-free in kmalloc_uaf+0xaa/0xb6 [test_kasan] Write of size 1 at addr ffff88006aa59da8 by task insmod/3951 CPU: 1 PID: 3951 Comm: insmod Tainted: G B 4.10.0+ #84 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x292/0x398 print_address_description+0x73/0x280 kasan_report.part.2+0x207/0x2f0 __asan_report_store1_noabort+0x2c/0x30 kmalloc_uaf+0xaa/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x7f22cfd0b9da RSP: 002b:00007ffe69118a78 EFLAGS: 00000206 ORIG_RAX: 00000000000000af RAX: ffffffffffffffda RBX: 0000555671242090 RCX: 00007f22cfd0b9da RDX: 00007f22cffcaf88 RSI: 000000000004df7e RDI: 00007f22d0399000 RBP: 00007f22cffcaf88 R08: 0000000000000003 R09: 0000000000000000 R10: 00007f22cfd07d0a R11: 0000000000000206 R12: 0000555671243190 R13: 000000000001fe81 R14: 0000000000000000 R15: 0000000000000004 Allocated by task 3951: save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_kmalloc+0xad/0xe0 kmem_cache_alloc_trace+0x82/0x270 kmalloc_uaf+0x56/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Freed by task 3951: save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_slab_free+0x72/0xc0 kfree+0xe8/0x2b0 kmalloc_uaf+0x85/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc The buggy address belongs to the object at ffff88006aa59da0 which belongs to the cache kmalloc-16 of size 16 The buggy address is located 8 bytes inside of 16-byte region [ffff88006aa59da0, ffff88006aa59db0) The buggy address belongs to the page: page:ffffea0001aa9640 count:1 mapcount:0 mapping: (null) index:0x0 flags: 0x100000000000100(slab) raw: 0100000000000100 0000000000000000 0000000000000000 0000000180800080 raw: ffffea0001abe380 0000000700000007 ffff88006c401b40 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88006aa59c80: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc ffff88006aa59d00: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc >ffff88006aa59d80: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ^ ffff88006aa59e00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ffff88006aa59e80: fb fb fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc ================================================================== from: ================================================================== BUG: KASAN: use-after-free in kmalloc_uaf+0xaa/0xb6 [test_kasan] at addr ffff88006c4dcb28 Write of size 1 by task insmod/3984 CPU: 1 PID: 3984 Comm: insmod Tainted: G B 4.10.0+ #83 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x292/0x398 kasan_object_err+0x1c/0x70 kasan_report.part.1+0x20e/0x4e0 __asan_report_store1_noabort+0x2c/0x30 kmalloc_uaf+0xaa/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 
SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x7feca0f779da RSP: 002b:00007ffdfeae5218 EFLAGS: 00000206 ORIG_RAX: 00000000000000af RAX: ffffffffffffffda RBX: 000055a064c13090 RCX: 00007feca0f779da RDX: 00007feca1236f88 RSI: 000000000004df7e RDI: 00007feca1605000 RBP: 00007feca1236f88 R08: 0000000000000003 R09: 0000000000000000 R10: 00007feca0f73d0a R11: 0000000000000206 R12: 000055a064c14190 R13: 000000000001fe81 R14: 0000000000000000 R15: 0000000000000004 Object at ffff88006c4dcb20, in cache kmalloc-16 size: 16 Allocated: PID = 3984 save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_kmalloc+0xad/0xe0 kmem_cache_alloc_trace+0x82/0x270 kmalloc_uaf+0x56/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Freed: PID = 3984 save_stack_trace+0x16/0x20 save_stack+0x43/0xd0 kasan_slab_free+0x73/0xc0 kfree+0xe8/0x2b0 kmalloc_uaf+0x85/0xb6 [test_kasan] kmalloc_tests_init+0x4f/0xa48 [test_kasan] do_one_initcall+0xf3/0x390 do_init_module+0x215/0x5d0 load_module+0x54de/0x82b0 SYSC_init_module+0x3be/0x430 SyS_init_module+0x9/0x10 entry_SYSCALL_64_fastpath+0x1f/0xc2 Memory state around the buggy address: ffff88006c4dca00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ffff88006c4dca80: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc >ffff88006c4dcb00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ^ ffff88006c4dcb80: fb fb fc fc 00 00 fc fc fb fb fc fc fb fb fc fc ffff88006c4dcc00: fb fb fc fc fb fb fc fc fb fb fc fc fb fb fc fc ================================================================== This patch (of 9): Introduce get_shadow_bug_type() function, which determines bug type based on the shadow value for a particular kernel address. Introduce get_wild_bug_type() function, which determines bug type for addresses which don't have a corresponding shadow value. 
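As a rough sketch of the shadow-based classification (not the exact switch; the KASAN_* shadow markers are defined in mm/kasan/kasan.h):

    u8 shadow = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);

    switch (shadow) {
    case KASAN_KMALLOC_FREE:      /* the 0xfb bytes in the dumps above */
        bug_type = "use-after-free";
        break;
    case KASAN_KMALLOC_REDZONE:   /* 0xfc */
        bug_type = "slab-out-of-bounds";
        break;
    /* stack/global redzones map to the matching *-out-of-bounds strings */
    }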
Link: http://lkml.kernel.org/r/20170302134851.101218-2-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a0803f16..c2bc08b1b5e0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -98,6 +104,27 @@ static void print_error_description(struct kasan_access_info *info) break; } + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_shadow_bug_type(info); + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); @@ -267,18 +294,11 @@ static void print_shadow_for_address(const void *addr) static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; - const char *bug_type; kasan_start_report(&flags); - if (info->access_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; + if (!addr_has_shadow(info)) { + const char *bug_type = get_wild_bug_type(info); pr_err("BUG: KASAN: %s on address %p\n", bug_type, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", -- cgit v1.2.3 From cd971c1a06d1c77a03873e8c7670bf6a9ab9c639 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:15 +1000 Subject: kasan: unify report headers Unify KASAN report header format for different kinds of bad memory accesses. Makes the code simpler. 
Link: http://lkml.kernel.org/r/20170302134851.101218-3-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c2bc08b1b5e0..d6b6ec77c56a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -121,16 +121,22 @@ const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = get_shadow_bug_type(info); + const char *bug_type = get_bug_type(info); pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); + bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + info->is_write ? "Write" : "Read", info->access_size, + current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -297,17 +303,11 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_start_report(&flags); + print_error_description(info); + if (!addr_has_shadow(info)) { - const char *bug_type = get_wild_bug_type(info); - pr_err("BUG: KASAN: %s on address %p\n", - bug_type, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); dump_stack(); } else { - print_error_description(info); print_address_description(info); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 20e5c73f3be18668199219e9685c4aa6b1eac669 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:15 +1000 Subject: kasan: change allocation and freeing stack traces headers Change stack traces headers from: Allocated: PID = 42 to: Allocated by task 42: Makes the report one line shorter and look better. 
Link: http://lkml.kernel.org/r/20170302134851.101218-4-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d6b6ec77c56a..7d24363edd66 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -177,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -201,10 +201,8 @@ static void kasan_object_err(struct kmem_cache *cache, void *object) if (!(cache->flags & SLAB_KASAN)) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + print_track(&alloc_info->alloc_track, "Allocated"); + print_track(&alloc_info->free_track, "Freed"); } void kasan_report_double_free(struct kmem_cache *cache, void *object, -- cgit v1.2.3 From 20af2c1cc0bb35b70578ed9d8f209ac8a5df6d5d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:16 +1000 Subject: kasan: simplify address description logic Simplify logic for describing a memory address. Add addr_to_page() helper function. Makes the code easier to follow. Link: http://lkml.kernel.org/r/20170302134851.101218-5-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7d24363edd66..a82d6896062b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -190,11 +190,18 @@ static void print_track(struct kasan_track *track, const char *prefix) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) +{ + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} + +static void describe_object(struct kmem_cache *cache, void *object) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - dump_stack(); pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, cache->object_size); @@ -213,34 +220,32 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, kasan_start_report(&flags); pr_err("BUG: Double free or freeing an invalid pointer\n"); pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); + dump_stack(); + describe_object(cache, object); kasan_end_report(&flags); } static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; + struct page *page = addr_to_page(addr); - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) { - struct page *page = virt_to_head_page(addr); - - if (PageSlab(page)) { - void *object; - struct kmem_cache *cache = page->slab_cache; - object = nearest_obj(cache, page, - (void *)info->access_addr); - kasan_object_err(cache, object); - return; - } + if (page) dump_page(page, "kasan: bad access detected"); + + dump_stack(); + + if (page && PageSlab(page)) { + struct kmem_cache *cache = page->slab_cache; + 
void *object = nearest_obj(cache, page, (void *)addr); + + describe_object(cache, object); } if (kernel_or_module_addr(addr)) { if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } static bool row_is_guilty(const void *row, const void *guilty) -- cgit v1.2.3 From 90ca2984ef36bf76ebb10b004677cc555bb29200 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:16 +1000 Subject: kasan: change report header Change report header format from: BUG: KASAN: use-after-free in unwind_get_return_address+0x28a/0x2c0 at addr ffff880069437950 Read of size 8 by task insmod/3925 to: BUG: KASAN: use-after-free in unwind_get_return_address+0x28a/0x2c0 Read of size 8 at addr ffff880069437950 by task insmod/3925 The exact access address is not usually important, so move it to the second line. This also makes the header look visually balanced. Link: http://lkml.kernel.org/r/20170302134851.101218-6-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index a82d6896062b..8efc69473a37 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -132,11 +132,11 @@ static void print_error_description(struct kasan_access_info *info) { const char *bug_type = get_bug_type(info); - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, - current->comm, task_pid_nr(current)); + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) -- cgit v1.2.3 From 493f185ab63909658f4c500ad0ee652e0abece4e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:17 +1000 Subject: kasan: improve slab object description Changes slab object description from: Object at ffff880068388540, in cache kmalloc-128 size: 128 to: The buggy address belongs to the object at ffff880068388540 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 123 bytes inside of 128-byte region [ffff880068388540, ffff8800683885c0) Makes it more explanatory and adds information about relative offset of the accessed address to the start of the object. 
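A quick illustration with made-up addresses, for a 128-byte object at ffff880068388540 (region [ffff880068388540, ffff8800683885c0)):

    access at ffff880068388530  ->  "16 bytes to the left of"
    access at ffff8800683885bb  ->  "123 bytes inside of"      (the case quoted above)
    access at ffff8800683885c4  ->  "4 bytes to the right of"

one line per branch of describe_object_addr() in the hunk below.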
Link: http://lkml.kernel.org/r/20170302134851.101218-7-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8efc69473a37..a1b1ef7a19f5 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -198,18 +198,49 @@ static struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object(struct kmem_cache *cache, void *object) +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); - if (!(cache->flags & SLAB_KASAN)) + if (!addr) return; - print_track(&alloc_info->alloc_track, "Allocated"); - print_track(&alloc_info->free_track, "Freed"); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); +} + +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + + if (cache->flags & SLAB_KASAN) { + print_track(&alloc_info->alloc_track, "Allocated"); + print_track(&alloc_info->free_track, "Freed"); + } + + describe_object_addr(cache, object, addr); } void kasan_report_double_free(struct kmem_cache *cache, void *object, @@ -221,13 +252,13 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, pr_err("BUG: Double free or freeing an invalid pointer\n"); pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); dump_stack(); - describe_object(cache, object); + describe_object(cache, object, NULL); kasan_end_report(&flags); } static void print_address_description(struct kasan_access_info *info) { - const void *addr = info->access_addr; + void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); if (page) @@ -237,9 +268,9 @@ static void print_address_description(struct kasan_access_info *info) if (page && PageSlab(page)) { struct kmem_cache *cache = page->slab_cache; - void *object = nearest_obj(cache, page, (void *)addr); + void *object = nearest_obj(cache, page, addr); - describe_object(cache, object); + describe_object(cache, object, addr); } if (kernel_or_module_addr(addr)) { -- cgit v1.2.3 From 8927e95194a69c91b9a7c71b6f83a2fbcd81573f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:17 +1000 Subject: kasan: print page description after stacks Moves page description after the stacks since it's less important. 
Link: http://lkml.kernel.org/r/20170302134851.101218-8-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index a1b1ef7a19f5..b015acc80876 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -261,9 +261,6 @@ static void print_address_description(struct kasan_access_info *info) void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); - if (page) - dump_page(page, "kasan: bad access detected"); - dump_stack(); if (page && PageSlab(page)) { @@ -273,9 +270,14 @@ static void print_address_description(struct kasan_access_info *info) describe_object(cache, object, addr); } - if (kernel_or_module_addr(addr)) { - if (!init_task_stack_addr(addr)) - pr_err("Address belongs to variable %pS\n", addr); + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + } + + if (page) { + pr_err("The buggy address belongs to the page:\n"); + dump_page(page, "kasan: bad access detected"); } } -- cgit v1.2.3 From ae8d01251c6ccb361a3cf3ae881f360e5e48fa54 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:18 +1000 Subject: kasan: improve double-free report format Changes double-free report header from BUG: Double free or freeing an invalid pointer Unexpected shadow byte: 0xFB to BUG: KASAN: double-free or invalid-free in kmalloc_oob_left+0xe5/0xef This makes a bug uniquely identifiable by the first report line. To account for removing of the unexpected shadow value, print shadow bytes at the end of the report as in reports for other kinds of bugs. 
Link: http://lkml.kernel.org/r/20170302134851.101218-9-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/kasan.c | 3 ++- mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 30 ++++++++++++++---------------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..9348d27088c1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8eb077..1229298cce64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b015acc80876..7d3d9670e233 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -243,22 +243,8 @@ static void describe_object(struct kmem_cache *cache, void *object, describe_object_addr(cache, object, addr); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) -{ - unsigned long flags; - - kasan_start_report(&flags); - pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - dump_stack(); - describe_object(cache, object, NULL); - kasan_end_report(&flags); -} - -static void print_address_description(struct kasan_access_info *info) +static void print_address_description(void *addr) { - void *addr = (void *)info->access_addr; struct page *page = addr_to_page(addr); dump_stack(); @@ -333,6 +319,18 @@ static void print_shadow_for_address(const void *addr) } } +void kasan_report_double_free(struct kmem_cache *cache, void *object, + void *ip) +{ + unsigned long flags; + + kasan_start_report(&flags); + pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + print_address_description(object); + print_shadow_for_address(object); + kasan_end_report(&flags); +} + static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; @@ -344,7 +342,7 @@ static void kasan_report_error(struct kasan_access_info *info) if (!addr_has_shadow(info)) { dump_stack(); } else { - print_address_description(info); + print_address_description((void *)info->access_addr); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 60a165975d8fb595641ce715522b071e6de0b1e6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 5 Apr 2017 09:21:18 +1000 Subject: kasan: separate report parts by empty lines Makes the report easier to read. 
Link: http://lkml.kernel.org/r/20170302134851.101218-10-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kasan/report.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7d3d9670e233..beee0e980e2d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -237,7 +237,9 @@ static void describe_object(struct kmem_cache *cache, void *object, if (cache->flags & SLAB_KASAN) { print_track(&alloc_info->alloc_track, "Allocated"); + pr_err("\n"); print_track(&alloc_info->free_track, "Freed"); + pr_err("\n"); } describe_object_addr(cache, object, addr); @@ -248,6 +250,7 @@ static void print_address_description(void *addr) struct page *page = addr_to_page(addr); dump_stack(); + pr_err("\n"); if (page && PageSlab(page)) { struct kmem_cache *cache = page->slab_cache; @@ -326,7 +329,9 @@ void kasan_report_double_free(struct kmem_cache *cache, void *object, kasan_start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); + pr_err("\n"); print_address_description(object); + pr_err("\n"); print_shadow_for_address(object); kasan_end_report(&flags); } @@ -338,11 +343,13 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_start_report(&flags); print_error_description(info); + pr_err("\n"); if (!addr_has_shadow(info)) { dump_stack(); } else { print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } -- cgit v1.2.3 From 4e9ebb5190ba6df165cfec2dd6f71e00739ec5b0 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Wed, 5 Apr 2017 09:21:19 +1000 Subject: fs/proc/inode.c: remove cast from memory allocation Coccinelle emits WARNING: casting value returned by memory allocation function to (struct proc_inode *) is useless. Remove unnecessary cast. Link: http://lkml.kernel.org/r/1487745720-16967-1-git-send-email-me@tobin.cc Signed-off-by: Tobin C. Harding Signed-off-by: Andrew Morton --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; -- cgit v1.2.3 From 44f507272bf7630f6f5dfc029c848e9afb0e5c5a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 5 Apr 2017 09:21:20 +1000 Subject: proc/sysctl: fix the int overflow for jiffies conversion do_proc_dointvec_jiffies_conv() uses LONG_MAX/HZ as the max value to avoid overflow. But actually the *valp is int type, so it still causes overflow. For example, echo 2147483647 > ./sys/net/ipv4/tcp_keepalive_time Then, cat ./sys/net/ipv4/tcp_keepalive_time The output is "-1", it is not expected. Now use INT_MAX/HZ as the max value instead LONG_MAX/HZ to fix it. 
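A worked example, taking HZ=1000 purely for illustration: *valp is an int, so the old LONG_MAX/HZ bound only protects the unsigned long intermediate, not the stored value:

    write 2147483647:  passes the old check (LONG_MAX/HZ is far larger on 64-bit)
    stored:            2147483647 * 1000 truncated to 32 bits wraps to -1000
    read back:         -1000 is converted back via HZ and printed as the "-1" above

Checking against INT_MAX/HZ (about 2147483 here) rejects such a write up front instead of letting it wrap.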
Link: http://lkml.kernel.org/r/1490109532-9228-1-git-send-email-fgao@ikuai8.com Signed-off-by: Gao Feng Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Eric Dumazet Cc: Josh Poimboeuf Signed-off-by: Andrew Morton --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acf0a5a06da7..60474dfa45c9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2571,7 +2571,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > LONG_MAX / HZ) + if (*lvalp > INT_MAX / HZ) return 1; *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); } else { -- cgit v1.2.3 From cbf9d4280bc59f242e0266258da11685380558cf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 5 Apr 2017 09:21:20 +1000 Subject: drivers/virt/fsl_hypervisor.c: use get_user_pages_unlocked() Moving from get_user_pages() to get_user_pages_unlocked() simplifies the code and takes advantage of VM_FAULT_RETRY functionality when faulting in pages. Link: http://lkml.kernel.org/r/20161101194332.23961-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: Michal Hocko Cc: Paolo Bonzini Cc: Kumar Gala Cc: Mihai Caraman Cc: Greg KH Signed-off-by: Andrew Morton --- drivers/virt/fsl_hypervisor.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 150ce2abf6c8..d3eca879a0a8 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); /* Get the physical addresses of the source buffer */ - down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? 0 : FOLL_WRITE, - pages, NULL); - up_read(¤t->mm->mmap_sem); + num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset, + num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE); if (num_pinned != num_pages) { /* get_user_pages() failed */ -- cgit v1.2.3 From 5e2b943025143912ffc0cf30dd2c04c25733e014 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 5 Apr 2017 09:21:21 +1000 Subject: jiffies.h: declare jiffies and jiffies_64 with ____cacheline_aligned_in_smp jiffies_64 is defined in kernel/time/timer.c with ____cacheline_aligned_in_smp, however this macro is not part of the declaration of jiffies and jiffies_64 in jiffies.h. As a result clang generates the following warning: kernel/time/timer.c:57:26: error: section does not match previous declaration [-Werror,-Wsection] __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; ^ include/linux/cache.h:39:36: note: expanded from macro '__cacheline_aligned_in_smp' ^ include/linux/cache.h:34:4: note: expanded from macro '__cacheline_aligned' __section__(".data..cacheline_aligned"))) ^ include/linux/jiffies.h:77:12: note: previous attribute is here extern u64 __jiffy_data jiffies_64; ^ include/linux/jiffies.h:70:38: note: expanded from macro '__jiffy_data' Link: http://lkml.kernel.org/r/20170403190200.70273-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: "Jason A . 
Donenfeld" Cc: Grant Grundler Cc: Michael Davidson Cc: Greg Hackmann Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/jiffies.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 624215cebee5..36872fbb815d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -1,6 +1,7 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include #include #include #include @@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* some arch's have a small-data section that can be accessed register-relative - * but that can only take up to, say, 4-byte variables. jiffies being part of - * an 8-byte variable may not be correctly accessed unless we force the issue - */ -#define __jiffy_data __attribute__((section(".data"))) - /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ -extern u64 __jiffy_data jiffies_64; -extern unsigned long volatile __jiffy_data jiffies; +extern u64 __cacheline_aligned_in_smp jiffies_64; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From d1b6ad546626d39f914fb4777fdcbed07a616932 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 5 Apr 2017 09:21:21 +1000 Subject: kernel/hung_task.c: defer showing held locks When I was running my testcase which may block hundreds of threads on fs locks, I got lockup due to output from debug_show_all_locks() added by commit b2d4c2edb2e4f89a ("locking/hung_task: Show all locks"). For example, if 1000 threads were blocked in TASK_UNINTERRUPTIBLE state and 500 out of 1000 threads hold some lock, debug_show_all_locks() from for_each_process_thread() loop will report locks held by 500 threads for 1000 times. This is a too much noise. In order to make sure rcu_lock_break() is called frequently, we should avoid calling debug_show_all_locks() from for_each_process_thread() loop because debug_show_all_locks() effectively calls for_each_process_thread() loop. Let's defer calling debug_show_all_locks() till before panic() or leaving for_each_process_thread() loop. 
Link: http://lkml.kernel.org/r/1489296834-60436-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reviewed-by: Vegard Nossum Cc: Ingo Molnar Signed-off-by: Andrew Morton --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496f..751593ed7c0b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) @@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, -- cgit v1.2.3 From cc70daf1fe328db2d728652f44266766f54669e8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Apr 2017 09:21:22 +1000 Subject: drivers/misc/vmw_vmci/vmci_queue_pair.c: fix a couple integer overflow tests The "DIV_ROUND_UP(size, PAGE_SIZE)" operation can overflow if "size" is more than ULLONG_MAX - PAGE_SIZE. 
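The overflow is in the rounding macro itself: DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), so for a u64 size within PAGE_SIZE of the maximum,

    size + PAGE_SIZE - 1    /* wraps around to a small value */

and num_pages ends up tiny, sailing past the existing SIZE_MAX checks further down. Rejecting size > SIZE_MAX - PAGE_SIZE before the division keeps num_pages meaningful.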
Link: http://lkml.kernel.org/r/20170322111950.GA11279@mwanda Signed-off-by: Dan Carpenter Cc: Jorgen Hansen Cc: Masahiro Yamada Cc: Michal Hocko Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/misc/vmw_vmci/vmci_queue_pair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index 498c0854305f..06c4974ee8dd 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -298,8 +298,11 @@ static void *qp_alloc_queue(u64 size, u32 flags) size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + @@ -624,9 +627,12 @@ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; - const u64 num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; + u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); + if (size > SIZE_MAX - PAGE_SIZE) + return NULL; + num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; -- cgit v1.2.3 From cf3e4b6661253d8670899cba733b0a94b41df943 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:23 +1000 Subject: Revert "lib/test_sort.c: make it explicitly non-modular" Patch series "lib: add module support to sort tests". This patch series allows to compile the array-based and linked list sort test code either to loadable modules, or builtin into the kernel. It's very valuable to have modular tests, so you can run them just by insmodding the test modules, instead of needing a separate kernel that runs them at boot. This patch (of 3): This reverts commit 8893f519330bb073a49c5b4676fce4be6f1be15d. It's very valuable to have modular tests, so you can run them just by insmodding the test modules, instead of needing a separate kernel that runs them at boot. Link: http://lkml.kernel.org/r/1488287219-15832-2-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/test_sort.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/test_sort.c b/lib/test_sort.c index 4db3911db50a..d389c1cc2f6c 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,11 +1,8 @@ #include #include -#include +#include -/* - * A simple boot-time regression test - * License: GPL - */ +/* a simple boot-time regression test */ #define TEST_LEN 1000 @@ -41,4 +38,6 @@ exit: kfree(a); return err; } -subsys_initcall(test_sort_init); + +module_init(test_sort_init); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d07dca671be9499dc32308f7b2f19666c60f480e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:23 +1000 Subject: lib: add module support to array-based sort tests Allow to compile the array-based sort test code either to a loadable module, or builtin into the kernel. 
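With the option made tristate the self-test can also be run on demand, e.g. (module name follows from the existing obj-$(CONFIG_TEST_SORT) += test_sort.o rule in lib/Makefile):

    # build with CONFIG_TEST_SORT=m, then
    modprobe test_sort

and since test_sort_init() returns its error code, a failing self-test shows up directly as a failed module load.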
Link: http://lkml.kernel.org/r/1488287219-15832-3-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 97d62c2da6c2..22dd8721d7c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1727,10 +1727,11 @@ config TEST_LIST_SORT If unsure, say N. config TEST_SORT - bool "Array-based sort test" - depends on DEBUG_KERNEL + tristate "Array-based sort test" + depends on DEBUG_KERNEL || m help - This option enables the self-test function of 'sort()' at boot. + This option enables the self-test function of 'sort()' at boot, + or at module load time. If unsure, say N. -- cgit v1.2.3 From a496bb33d48780a9bb8fc35e6174ef29ed83d33a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Apr 2017 09:21:24 +1000 Subject: lib: add module support to linked list sorting tests Extract the linked list sorting test code into its own source file, to allow to compile it either to a loadable module, or builtin into the kernel. Link: http://lkml.kernel.org/r/1488287219-15832-4-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Andy Shevchenko Cc: Arnd Bergmann Cc: Paul Gortmaker Cc: Shuah Khan Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 7 +-- lib/Makefile | 1 + lib/list_sort.c | 149 -------------------------------------------------- lib/test_list_sort.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 152 deletions(-) create mode 100644 lib/test_list_sort.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 22dd8721d7c2..924f210db65f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1718,11 +1718,12 @@ config LKDTM Documentation/fault-injection/provoke-crashes.txt config TEST_LIST_SORT - bool "Linked list sorting test" - depends on DEBUG_KERNEL + tristate "Linked list sorting test" + depends on DEBUG_KERNEL || m help Enable this to turn on 'list_sort()' function test. This test is - executed only once during system boot, so affects only boot time. + executed only once during system boot (so affects only boot time), + or at module load time. If unsure, say N. diff --git a/lib/Makefile b/lib/Makefile index 320ac46a8725..786c4538a91f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/list_sort.c b/lib/list_sort.c index 3fe401067e20..9e9acc37652f 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,6 +1,3 @@ - -#define pr_fmt(fmt) "list_sort_test: " fmt - #include #include #include @@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head, merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } EXPORT_SYMBOL(list_sort); - -#ifdef CONFIG_TEST_LIST_SORT - -#include -#include - -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). 
- */ -#define TEST_LIST_LEN (512+128+2) /* not including head */ - -#define TEST_POISON1 0xDEADBEEF -#define TEST_POISON2 0xA324354C - -struct debug_el { - unsigned int poison1; - struct list_head list; - unsigned int poison2; - int value; - unsigned serial; -}; - -/* Array, containing pointers to all elements in the test list */ -static struct debug_el **elts __initdata; - -static int __init check(struct debug_el *ela, struct debug_el *elb) -{ - if (ela->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", ela->serial); - return -EINVAL; - } - if (elb->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", elb->serial); - return -EINVAL; - } - if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - pr_err("error: phantom element\n"); - return -EINVAL; - } - if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); - return -EINVAL; - } - if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); - return -EINVAL; - } - return 0; -} - -static int __init cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct debug_el *ela, *elb; - - ela = container_of(a, struct debug_el, list); - elb = container_of(b, struct debug_el, list); - - check(ela, elb); - return ela->value - elb->value; -} - -static int __init list_sort_test(void) -{ - int i, count = 1, err = -ENOMEM; - struct debug_el *el; - struct list_head *cur; - LIST_HEAD(head); - - pr_debug("start testing list_sort()\n"); - - elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); - return err; - } - - for (i = 0; i < TEST_LIST_LEN; i++) { - el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); - goto exit; - } - /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); - el->serial = i; - el->poison1 = TEST_POISON1; - el->poison2 = TEST_POISON2; - elts[i] = el; - list_add_tail(&el->list, &head); - } - - list_sort(NULL, &head, cmp); - - err = -EINVAL; - for (cur = head.next; cur->next != &head; cur = cur->next) { - struct debug_el *el1; - int cmp_result; - - if (cur->next->prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - cmp_result = cmp(NULL, cur, cur->next); - if (cmp_result > 0) { - pr_err("error: list is not sorted\n"); - goto exit; - } - - el = container_of(cur, struct debug_el, list); - el1 = container_of(cur->next, struct debug_el, list); - if (cmp_result == 0 && el->serial >= el1->serial) { - pr_err("error: order of equivalent elements not " - "preserved\n"); - goto exit; - } - - if (check(el, el1)) { - pr_err("error: element check failed\n"); - goto exit; - } - count++; - } - if (head.prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - - if (count != TEST_LIST_LEN) { - pr_err("error: bad list length %d", count); - goto exit; - } - - err = 0; -exit: - for (i = 0; i < TEST_LIST_LEN; i++) - kfree(elts[i]); - kfree(elts); - return err; -} -late_initcall(list_sort_test); -#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c new file mode 100644 index 000000000000..28e817387b04 --- /dev/null +++ b/lib/test_list_sort.c @@ -0,0 +1,150 @@ +#define pr_fmt(fmt) "list_sort_test: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* + * The pattern of set bits in the list length determines 
which cases + * are hit in list_sort(). + */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + pr_err("error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + +static int __init list_sort_test(void) +{ + int i, count = 1, err = -ENOMEM; + struct debug_el *el; + struct list_head *cur; + LIST_HEAD(head); + + pr_debug("start testing list_sort()\n"); + + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); + if (!elts) { + pr_err("error: cannot allocate memory\n"); + return err; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + pr_err("error: cannot allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + err = -EINVAL; + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + pr_err("error: list is not sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + pr_err("error: order of equivalent elements not " + "preserved\n"); + goto exit; + } + + if (check(el, el1)) { + pr_err("error: element check failed\n"); + goto exit; + } + count++; + } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + + if (count != TEST_LIST_LEN) { + pr_err("error: bad list length %d", count); + goto exit; + } + + err = 0; +exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); + kfree(elts); + return err; +} +module_init(list_sort_test); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 004d4db8186a65fdb9035c56487ad716fb3cab28 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 5 Apr 2017 09:21:24 +1000 Subject: firmware/Makefile: force recompilation if makefile changes If you modify the target asm we currently do not force the recompilation of the firmware files. 
The target asm is in the firmware/Makefile, peg this file as a dependency to require re-compilation of firmware targets when the asm changes. Link: http://lkml.kernel.org/r/20170123150727.4883-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Masahiro Yamada Cc: Michal Marek Cc: Ming Lei Cc: Greg Kroah-Hartman Cc: Tom Gundersen Cc: David Woodhouse Signed-off-by: Andrew Morton --- firmware/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/firmware/Makefile b/firmware/Makefile index e297e1b52636..fa3e81c2a97b 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@ wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ include/config/ppc32.h include/config/ppc64.h \ include/config/superh32.h include/config/superh64.h \ - include/config/x86_32.h include/config/x86_64.h) + include/config/x86_32.h include/config/x86_64.h \ + firmware/Makefile) $(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) -- cgit v1.2.3 From 968fc5a74122ccb4805e158f00041c310b00c4d9 Mon Sep 17 00:00:00 2001 From: Ruslan Bilovol Date: Wed, 5 Apr 2017 09:21:25 +1000 Subject: checkpatch: remove obsolete CONFIG_EXPERIMENTAL checks Config EXPERIMENTAL has been removed from kernel in 2013 (see 3d374d0: "final removal of CONFIG_EXPERIMENTAL"), there is no any reason to do these checks now. Link: http://lkml.kernel.org/r/1488234097-20119-1-git-send-email-ruslan.bilovol@gmail.com Signed-off-by: Ruslan Bilovol Acked-by: Kees Cook Acked-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index baa3c7be04ad..30eeba4f1602 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2757,13 +2757,6 @@ sub process { #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } -# discourage the addition of CONFIG_EXPERIMENTAL in Kconfig. - if ($realfile =~ /Kconfig/ && - $line =~ /.\s*depends on\s+.*\bEXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # discourage the use of boolean for type definition attributes of Kconfig options if ($realfile =~ /Kconfig/ && $line =~ /^\+\s*\bboolean\b/) { @@ -3157,12 +3150,6 @@ sub process { } } -# discourage the addition of CONFIG_EXPERIMENTAL in #if(def). - if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { - WARN("CONFIG_EXPERIMENTAL", - "Use of CONFIG_EXPERIMENTAL is deprecated. For alternatives, see https://lkml.org/lkml/2012/10/23/580\n"); - } - # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", -- cgit v1.2.3 From 76a02a9d1023bdc9a5e6cc40549f9a3b2bb807ac Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:25 +1000 Subject: checkpatch: add ability to find bad uses of vsprintf %p extensions %pK was at least once misused at %pk in an out-of-tree module. This lead to some security concerns. Add the ability to track single and multiple line statements for misuses of %p. 
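As an illustration (hypothetical code, not taken from any in-tree user), the new
VSPRINTF_POINTER_EXTENSION test would warn on the lower-case variant while leaving
the documented %pK extension alone:

	pr_info("key at %pk\n", ptr);	/* flagged: 'k' is not a valid %p extension */
	pr_info("key at %pK\n", ptr);	/* ok: %pK output is restricted via kptr_restrict */
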
Link: http://lkml.kernel.org/r/163a690510e636a23187c0dc9caa09ddac6d4cde.1488228427.git.joe@perches.com Signed-off-by: Joe Perches Acked-by: Kees Cook Acked-by: William Roberts Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 30eeba4f1602..732bb3e2fe9a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5663,6 +5663,32 @@ sub process { } } + # check for vsprintf extension %p misuses + if ($^V && $^V ge 5.10.0 && + defined $stat && + $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && + $1 !~ /^_*volatile_*$/) { + my $bad_extension = ""; + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + for (my $count = $linenr; $count <= $lc; $count++) { + my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0)); + $fmt =~ s/%%//g; + if ($fmt =~ /(\%[\*\d\.]*p(?![\WFfSsBKRraEhMmIiUDdgVCbGN]).)/) { + $bad_extension = $1; + last; + } + } + if ($bad_extension ne "") { + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + WARN("VSPRINTF_POINTER_EXTENSION", + "Invalid vsprintf pointer extension '$bad_extension'\n" . "$here\n$stat_real\n"); + } + } + # Check for misused memsets if ($^V && $^V ge 5.10.0 && defined $stat && -- cgit v1.2.3 From f58283acea78e2ce95a3dc9b6bdc39160cfe4dd8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:26 +1000 Subject: checkpatch-add-ability-to-find-bad-uses-of-vsprintf-%pfoo-extensions-fix add helpful comment into lib/vsprintf.c Cc: Joe Perches Cc: Kees Cook Cc: William Roberts Signed-off-by: Andrew Morton --- lib/vsprintf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e3bf4e0f10b5..df113aa9ae5e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * + * Please update scripts/checkpatch.pl when adding new conversion characters. + * (search for "check for vsprintf extension"). + * * Right now we handle: * * - 'F' For symbolic function descriptor pointers with offset -- cgit v1.2.3 From f5c1271b9d5bddc50ccc3a4b1a186501a6d3fecd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:27 +1000 Subject: checkpatch-add-ability-to-find-bad-uses-of-vsprintf-%pfoo-extensions-fix-fix text tweak Cc: Joe Perches Cc: Kees Cook Cc: William Roberts Signed-off-by: Andrew Morton --- lib/vsprintf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index df113aa9ae5e..db44e6049028 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,8 +1477,8 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * - * Please update scripts/checkpatch.pl when adding new conversion characters. - * (search for "check for vsprintf extension"). + * Please update scripts/checkpatch.pl when adding/removing conversion + * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * -- cgit v1.2.3 From c8fd864cec92975be639d388837fff5dcc4e727f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:27 +1000 Subject: checkpatch: improve EMBEDDED_FUNCTION_NAME test The existing behavior relies on patch context to identify function declarations. 
Add the ability to find function declarations when there is an open brace in column 1. This finds function declarations only in specific single line forms where the function name is on a single line like: int foo(args...) { and int foo(args...) { It does not recognize function declarations like: int foo(int bar, int baz) { Link: http://lkml.kernel.org/r/738d74bbbe1a06b80f11ed504818107c68903095.1488155636.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 732bb3e2fe9a..832e8150dba3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3126,6 +3126,17 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check if this appears to be the start function declaration, save the name + if ($sline =~ /^\+\{\s*$/ && + $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { + $context_function = $1; + } + +# check if this appears to be the end of function declaration + if ($sline =~ /^\+\}\s*$/) { + undef $context_function; + } + # check indentation of any line with a bare else # (but not if it is a multiple line "if (foo) return bar; else return baz;") # if the previous line is a break or return and is indented 1 tab more... -- cgit v1.2.3 From 3a44cbfea104dd310da61346c85f0e913f7c555b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Apr 2017 09:21:28 +1000 Subject: checkpatch: allow space leading blank lines in email headers Allow a leading space and otherwise blank link in the email headers as it can be a line wrapped Spamassassin multiple line string or any other valid rfc 2822/5322 email header. The line with space causes checkpatch to erroniously think that it's in the content body, as opposed to headers and thus flag a mail header as an unwrapped long comment line. Link: http://lkml.kernel.org/r/d75a9f0b78b3488078429f4037d9fff3bdfa3b78.1490247180.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Darren Hart (VMware) Tested-by: Darren Hart (VMware) Reviewed-by: Darren Hart (VMware) Original-patch-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 832e8150dba3..089c974aa3a5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2628,8 +2628,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - !($rawline =~ /^\s+\S/ || - $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { + !($rawline =~ /^\s+(?:\S|$)/ || + $rawline =~ /^(?:commit\b|from\b|[\w-]+:)/i)) { $in_header_lines = 0; $in_commit_log = 1; $has_commit_log = 1; -- cgit v1.2.3 From 315ed516e9cade03627f7ae8f03d60e0edf19c18 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Apr 2017 09:21:29 +1000 Subject: reiserfs: use designated initializers Prepare to mark sensitive kernel structures for randomization by making sure they're using designated initializers. These were identified during allyesconfig builds of x86, arm, and arm64, with most initializer fixes extracted from grsecurity. 
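The idea, sketched here on a made-up structure rather than the reiserfs one (the
real conversion is in the diff below): positional initializers bind values purely
by member order, so any reordering or layout randomization silently breaks them,
whereas designated initializers name each member explicitly:

	struct ops {
		int  (*open)(void);
		void (*close)(void);
	};

	int  my_open(void);
	void my_close(void);

	/* positional: pairs values with members by order only */
	static struct ops fragile_ops = { my_open, my_close };

	/* designated: still correct if the members are reordered */
	static struct ops robust_ops = {
		.open	= my_open,
		.close	= my_close,
	};
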
Link: http://lkml.kernel.org/r/20170329210419.GA40066@beast Signed-off-by: Kees Cook Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/reiserfs/item_ops.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd73906..e3c558d1b78c 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) -- cgit v1.2.3 From bf4f265e6677e23afe15a2e4cd2d4a9db9144bab Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 5 Apr 2017 09:21:30 +1000 Subject: cpumask: make "nr_cpumask_bits" unsigned Bit searching functions accept "unsigned long" indices but "nr_cpumask_bits" is "int" which is signed, so inevitable sign extensions occur on x86_64. Those MOVSX are #1 MOVSX bloat by number of uses across whole kernel. Change "nr_cpumask_bits" to unsigned, this number can't be negative after all. It allows to do implicit zero-extension on x86_64 without MOVSX. Change signed comparisons into unsigned comparisons where necessary. Other uses looks fine because it is either argument passed to a function or comparison is already unsigned. Net win on allyesconfig type of kernel: ~2.8 KB (!) add/remove: 0/0 grow/shrink: 8/725 up/down: 93/-2926 (-2833) function old new delta xen_exit_mmap 691 735 +44 qstat_read 426 440 +14 __cpufreq_cooling_register 1678 1687 +9 trace_rb_cpu_prepare 447 455 +8 vermagic 54 60 +6 nfp_driver_version 54 60 +6 rcu_torture_stats_print 1147 1151 +4 find_next_push_cpu 267 269 +2 xen_irq_resume 961 960 -1 ... init_vp_index 946 906 -40 od_set_powersave_bias 328 281 -47 power_cpu_exit 193 139 -54 arch_show_interrupts 3538 3484 -54 select_idle_sibling 1558 1471 -87 Total: Before=158358910, After=158356077, chg -0.00% Same arguments apply to "nr_cpu_ids" but I haven't yet found enough courage to delve into this issue (and proper fix may require new type "cpu_t" which is whole separate story). 
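A minimal before/after sketch of the codegen effect (hypothetical helpers, not part
of the patch): the bit-search primitives take an unsigned long size, so a signed
limit must be sign-extended at every call site on x86_64, while an unsigned limit
can be zero-extended implicitly:

	#include <linux/bitops.h>

	extern int signed_limit;		/* stand-in for the old nr_cpumask_bits */
	extern unsigned int unsigned_limit;	/* stand-in for the new nr_cpumask_bits */

	unsigned long first_cpu_signed(const unsigned long *mask)
	{
		return find_first_bit(mask, signed_limit);	/* movsx before the call */
	}

	unsigned long first_cpu_unsigned(const unsigned long *mask)
	{
		return find_first_bit(mask, unsigned_limit);	/* implicit 32->64 zero extension */
	}
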
Link: http://lkml.kernel.org/r/20170309205322.GA1728@avx2 Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton --- arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- include/linux/cpumask.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 8c35b3152e1e..44b50646582d 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1c0b58545c04..534a10c00c54 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c..a3ba193f042e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* -- cgit v1.2.3 From 28c968ba5ee884e98f2c54da11b6336586bdea17 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:31 +1000 Subject: crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE Patch series "kexec/fadump: remove dependency with CONFIG_KEXEC and reuse crashkernel parameter for fadump", v4. Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. This patchset removes dependency with CONFIG_KEXEC for crashkernel parameter and vmcoreinfo related code as it can be reused without kexec support. Also, crashkernel parameter is reused instead of fadump_reserve_mem to reserve memory for fadump. The first patch moves crashkernel parameter parsing and vmcoreinfo related code under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The second patch reuses the definitions of append_elf_note() & final_note() functions under CONFIG_CRASH_CORE in IA64 arch code. The third patch removes dependency on CONFIG_KEXEC for firmware-assisted dump (fadump) in powerpc. The next patch reuses crashkernel parameter for reserving memory for fadump, instead of the fadump_reserve_mem parameter. This has the advantage of using all syntaxes crashkernel parameter supports, for fadump as well. The last patch updates fadump kernel documentation about use of crashkernel parameter. 
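For reference, the crashkernel= forms handled by the code being moved (see
parse_crashkernel_simple(), parse_crashkernel_mem() and parse_crashkernel_suffix()
below) are, schematically:

	crashkernel=256M			fixed size
	crashkernel=256M@64M			fixed size at a fixed offset
	crashkernel=512M-2G:64M,2G-:128M	size chosen by system RAM range
	crashkernel=512M,high			size with a ,high/,low placement suffix
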
This patch (of 5): Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by dump capture kernel. crashkernel parameter can be a reused, for memory reservation, by such architecture specific infrastructure. But currently, code related to vmcoreinfo and parsing of crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without dependency on CONFIG_KEXEC. There is no functional change with this patch. Link: http://lkml.kernel.org/r/149035338104.6881.4550894432615189948.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Cc: Fenghua Yu Cc: Tony Luck Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/Kconfig | 4 + include/linux/crash_core.h | 65 +++++++ include/linux/kexec.h | 57 +----- include/linux/printk.h | 4 +- kernel/Makefile | 1 + kernel/crash_core.c | 445 +++++++++++++++++++++++++++++++++++++++++++++ kernel/kexec_core.c | 403 ---------------------------------------- kernel/ksysfs.c | 8 +- kernel/printk/printk.c | 6 +- 9 files changed, 531 insertions(+), 462 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..ffdf5e3833da 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config HAVE_IMA_KEXEC diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 000000000000..18d0f946fda3 --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,65 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, 
#field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); + +#endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..c9481ebcbc0c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,17 +14,15 @@ #if !defined(__ASSEMBLY__) +#include #include #include #ifdef CONFIG_KEXEC_CORE #include -#include #include #include -#include -#include #include #include @@ -62,19 +60,15 @@ #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define KEXEC_CORE_NOTE_NAME "CORE" -#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4) -#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME + /* * The per-cpu notes area is a list of notes terminated by a "NULL" * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. 
*/ #ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ - KEXEC_CORE_NOTE_NAME_BYTES + \ - KEXEC_CORE_NOTE_DESC_BYTES ) +#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES #endif /* @@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -303,31 +270,15 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ - + VMCOREINFO_NOTE_NAME_BYTES) - /* Location of a reserved region to hold the crash kernel. 
*/ extern struct resource crashk_res; extern struct resource crashk_low_res; -typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/printk.h b/include/linux/printk.h index 571257e0f53d..e10f27468322 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -198,7 +198,7 @@ extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); -void log_buf_kexec_setup(void); +void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); @@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void) return 0; } -static inline void log_buf_kexec_setup(void) +static inline void log_buf_vmcoreinfo_setup(void) { } diff --git a/kernel/Makefile b/kernel/Makefile index b302b4731d16..72aa080f91f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..4261587a34d2 --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,445 @@ +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +#include +#include + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) 
+{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b3872..9dd722912850 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. 
*/ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -1084,403 +1078,6 @@ static int __init crash_notes_memory_init(void) subsys_initcall(crash_notes_memory_init); -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - 
VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2984fb0f0257..721d37b248c7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); -- cgit v1.2.3 From 562e209d35194eb900be9a3cef4572e3836f89d8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:31 +1000 Subject: ia64: reuse append_elf_note() and final_note() functions Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE Also, define Elf_Word and use it instead of generic u32 or the more specific Elf64_Word. 
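A short usage sketch of the shared helpers ("note_buf" is a hypothetical, suitably
sized buffer): a caller appends one or more notes and terminates the list with an
empty note, as the existing in-tree callers do:

	struct elf_prstatus prstatus = { };
	Elf_Word *buf = note_buf;

	buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);	/* an all-zero elf_note terminates the note list */
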
Link: http://lkml.kernel.org/r/149035342324.6881.11667840929850361402.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Acked-by: Tony Luck Cc: Fenghua Yu Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/ia64/kernel/crash.c | 22 ---------------------- include/linux/crash_core.h | 4 ++++ include/linux/elf.h | 2 ++ kernel/crash_core.c | 34 ++++++++++++++-------------------- kernel/kexec_core.c | 28 ---------------------------- 5 files changed, 20 insertions(+), 70 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f359e2a7..75859a07d75b 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 18d0f946fda3..541a197ba4a2 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,10 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8ae313..ba069e8f4f78 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4261587a34d2..fcbd568f1e95 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -291,32 +291,26 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } -static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, - void *data, size_t data_len) +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) { - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + 
buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } -static void final_note(u32 *buf) +void final_note(Elf_Word *buf) { - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); + memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9dd722912850..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -990,34 +990,6 @@ unlock: return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; -- cgit v1.2.3 From 0c9d00685af0b6363ffe74bd124997849c6dbc36 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:32 +1000 Subject: powerpc/fadump: remove dependency with CONFIG_KEXEC Now that crashkernel parameter parsing and vmcoreinfo related code is moved under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE, remove dependency with CONFIG_KEXEC for CONFIG_FA_DUMP. While here, get rid of definitions of fadump_append_elf_note() & fadump_final_note() functions to reuse similar functions compiled under CONFIG_CRASH_CORE. Link: http://lkml.kernel.org/r/149035343956.6881.1536459326017709354.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 10 ++++++---- arch/powerpc/include/asm/fadump.h | 2 ++ arch/powerpc/kernel/crash.c | 2 -- arch/powerpc/kernel/fadump.c | 34 +++------------------------------- arch/powerpc/kernel/setup-common.c | 5 +++++ 5 files changed, 16 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..6bc1fa232341 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -522,21 +522,23 @@ config RELOCATABLE_TEST relocation code. config CRASH_DUMP - bool "Build a kdump crash kernel" + bool "Build a dump capture kernel" depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP) select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE help - Build a kernel suitable for use as a kdump capture kernel. + Build a kernel suitable for use as a dump capture kernel. The same kernel binary can be used as production kernel and dump capture kernel. config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC_CORE + depends on PPC64 && PPC_RTAS + select CRASH_CORE + select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with assistance from firmware. 
This approach does not use kexec, - instead firmware assists in booting the kdump kernel + instead firmware assists in booting the capture kernel while preserving memory contents. Firmware-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 0031806475f0..60b91084f33c 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -73,6 +73,8 @@ reg_entry++; \ }) +extern int crashing_cpu; + /* Kernel Dump section info */ struct fadump_section { __be32 request_flag; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 47b63de81f9b..cbabb5adccd9 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -43,8 +43,6 @@ #define IPI_TIMEOUT 10000 #define REAL_MODE_TIMEOUT 10000 -/* This keeps a track of which one is the crashing cpu. */ -int crashing_cpu = -1; static int time_to_dump; #define CRASH_HANDLER_MAX 3 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ff0dd4e77a7..31c0abec51cf 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -509,34 +509,6 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) return reg_entry; } -static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void fadump_final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -547,8 +519,8 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = ???? */ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); - buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); return buf; } @@ -689,7 +661,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) note_buf = fadump_regs_to_elf_notes(note_buf, ®s); } } - fadump_final_note(note_buf); + final_note(note_buf); if (fdh) { pr_debug("Updating elfcore header (%llx) with cpu notes\n", diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4697da895133..508dd9af20b1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -125,6 +125,11 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif +#ifdef CONFIG_CRASH_CORE +/* This keeps a track of which one is the crashing cpu. */ +int crashing_cpu = -1; +#endif + /* also used by kexec */ void machine_shutdown(void) { -- cgit v1.2.3 From 74177c5d453e368b274e49331267f3a81fb28253 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:32 +1000 Subject: powerpc/fadump: reuse crashkernel parameter for fadump memory reservation fadump supports specifying memory to reserve for fadump's crash kernel with fadump_reserve_mem kernel parameter. 
This parameter currently supports passing a fixed memory size, as in fadump_reserve_mem=<size>, only. This patch aims to add support for other syntaxes, like the range-based form <range1>:<size1>[,<range2>:<size2>,...], which allows using the same parameter to boot the kernel with different system RAM sizes. As the crashkernel parameter already supports the above-mentioned syntaxes, this patch deprecates the fadump_reserve_mem parameter and reuses the crashkernel parameter instead, to specify memory for fadump's crash kernel memory reservation as well. If any offset is provided in the crashkernel parameter, it is ignored in the case of fadump, as fadump reserves memory at the end of RAM. The advantages of using the crashkernel parameter instead of fadump_reserve_mem are one less kernel parameter overall, code reuse, and support for multiple syntaxes to specify memory. Suggested-by: Dave Young Link: http://lkml.kernel.org/r/149035346749.6881.911095631212975718.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/powerpc/kernel/fadump.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 31c0abec51cf..e013f8fa6f99 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -210,14 +210,20 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, */ static inline unsigned long fadump_calculate_reserve_size(void) { - unsigned long size; + int ret; + unsigned long long base, size; /* - * Check if the size is specified through fadump_reserve_mem= cmdline - * option. If yes, then use that. + * Check if the size is specified through crashkernel= cmdline + * option. If yes, then use that but ignore base as fadump + * reserves memory at end of RAM. */ - if (fw_dump.reserve_bootvar) + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &size, &base); + if (ret == 0 && size > 0) { + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } /* divide by 20 to get 5% of value */ size = memblock_end_of_DRAM() / 20; @@ -353,15 +359,6 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -/* Look for fadump_reserve_mem= cmdline option */ -static int __init early_fadump_reserve_mem(char *p) -{ - if (p) - fw_dump.reserve_bootvar = memparse(p, &p); - return 0; -} -early_param("fadump_reserve_mem", early_fadump_reserve_mem); - static void register_fw_dump(struct fadump_mem_struct *fdm) { int rc; -- cgit v1.2.3 From be8232029aaeec1258d04f99ac9196125cbc7ed4 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 5 Apr 2017 09:21:33 +1000 Subject: powerpc/fadump: update documentation about crashkernel parameter reuse As we are reusing the crashkernel parameter instead of the fadump_reserve_mem parameter to specify the memory to reserve for fadump's crash kernel, update the documentation accordingly.
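As an editorial illustration, not part of this patch (the sizes here are made up; the exact grammar is documented in Documentation/kdump/kdump.txt), a fadump kernel can now be booted with either a fixed or a range-based reservation:

    fadump=on crashkernel=2G
    fadump=on crashkernel=4G-64G:2G,64G-:4G
    fadump=on crashkernel=2G@32M    (the @32M offset is accepted but ignored, since fadump reserves memory at the end of RAM)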
Link: http://lkml.kernel.org/r/149035347559.6881.14224829694291758581.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Dave Young Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Signed-off-by: Andrew Morton --- Documentation/powerpc/firmware-assisted-dump.txt | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 3007bc98af28..8394bc8e34d7 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -55,10 +55,14 @@ as follows: booted with restricted memory. By default, the boot memory size will be the larger of 5% of system RAM or 256MB. Alternatively, user can also specify boot memory size - through boot parameter 'fadump_reserve_mem=' which will - override the default calculated size. Use this option - if default boot memory size is not sufficient for second - kernel to boot successfully. + through boot parameter 'crashkernel=' which will override + the default calculated size. Use this option if default + boot memory size is not sufficient for second kernel to + boot successfully. For syntax of crashkernel= parameter, + refer to Documentation/kdump/kdump.txt. If any offset is + provided in crashkernel= parameter, it will be ignored + as fadump reserves memory at end of RAM for boot memory + dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will @@ -158,13 +162,16 @@ How to enable firmware-assisted dump (fadump): 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. -3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline +3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. -NOTE: If firmware-assisted dump fails to reserve memory then it will - fallback to existing kdump mechanism if 'crashkernel=' option - is set at kernel cmdline. +NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead + use 'crashkernel=' to specify size of the memory to reserve + for boot memory dump preservation. + 2. If firmware-assisted dump fails to reserve memory then it + will fallback to existing kdump mechanism if 'crashkernel=' + option is set at kernel cmdline. Sysfs/debugfs files: ------------ -- cgit v1.2.3 From 15f71e0aff918af8c8bc28268ce5f34f365db257 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 5 Apr 2017 09:21:33 +1000 Subject: kdump, vmcoreinfo: report actual value of phys_base Currently, VMCOREINFO note information reports the virtual address of phys_base that is assigned to symbol phys_base. But this doesn't make sense because to refer to phys_base, it's necessary to get the value of phys_base itself we are now about to refer to. Userland tools related to kdump such as makedumpfile and crash utility so far have made some efforts to calculate phys_base on crash dump formats generated by mechanisms running outside Linux kernel, such as virtual machine hypervisor such as qemu dump, which ordinary users use via virsh dump, or ones implemented on vendor specific firmware. That is, find a kernel data whose virtual and physical addresses are available via its note information and calculate phys_base from it. 
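(As a sketch of that calculation, not part of this patch: on x86_64, given a datum whose link-time virtual address vaddr and run-time physical address paddr are both known from the notes, the relocation offset follows as roughly

    phys_base = paddr - (vaddr - __START_KERNEL_map);

with the caveat that real tools must also check which kernel mapping the virtual address belongs to.)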
However, such a data structure is not one prepared for the purpose of deriving phys_base. There's no guarantee that other crash dump mechanisms include such information that can be used to calculate phys_base similarly. To get VMCOREINFO from a vmcore, it's easy to use the strings and grep commands like this; VMCOREINFO consists of simple strings: $ strings vmcore-3.10.0-121.el7.x86_64 | grep -E ".*VMCOREINFO.*" -A 100 VMCOREINFO OSRELEASE=3.10.0-121.el7.x86_64 PAGESIZE=4096 ... This is also useful to get the value of phys_base in the kdump 2nd kernel contained in a vmcore using the above-mentioned external crash dump mechanism; the kdump 2nd kernel is an inherently relocated kernel. This commit doesn't remove the VMCOREINFO_SYMBOL(phys_base) line, because makedumpfile refers to it and removing it would break old versions of makedumpfile. Signed-off-by: HATAYAMA Daisuke Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Dave Anderson Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 1 + include/linux/crash_core.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 857cdbd02867..8b9947c414c4 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -341,6 +341,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 541a197ba4a2..eb71a70ea2b5 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -50,6 +50,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; -- cgit v1.2.3 From 76e0c165b113a14f6fdce951675e28b6371abd97 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 5 Apr 2017 09:21:34 +1000 Subject: uapi: fix linux/sysctl.h userspace compilation errors Include <stddef.h> (guarded by #ifndef __KERNEL__) to fix the following linux/sysctl.h userspace compilation errors: /usr/include/linux/sysctl.h:38:2: error: unknown type name 'size_t' size_t *oldlenp; /usr/include/linux/sysctl.h:40:2: error: unknown type name 'size_t' size_t newlen; This also fixes userspace compilation of uapi headers that include linux/sysctl.h, e.g. linux/netfilter.h. Link: http://lkml.kernel.org/r/20170222230652.GA14373@altlinux.org Signed-off-by: Dmitry V. Levin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- include/uapi/linux/sysctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d2b12152e358..c6d18aaeb3a4 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -26,6 +26,10 @@ #include #include +#ifndef __KERNEL__ +#include <stddef.h> /* For size_t. */ +#endif + #define CTL_MAXNAME 10 /* how many path components do we allow in a call to sysctl?
In other words, what is the largest acceptable value for the nlen -- cgit v1.2.3 From beb35b0574f603c376a8da0f4f08ef594dde4fa7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 5 Apr 2017 09:21:35 +1000 Subject: fs/nsfs.c: allow ns_entries to have custom symlink content Patch series "Expose task pid_ns_for_children to userspace". pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the course of their work. If they have a custom pid_ns_for_children before dump, they must have the same ns after restore. Otherwise, the restored task bumps into an environment it does not expect. This patchset solves the problem. It exposes pid_ns_for_children to the ns directory in the standard way with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] This patch (of 2): Make it possible to have the link content prefix yyy differ from the link name xxx: $ readlink /proc/[pid]/ns/xxx yyy:[4026531838] This will be used in the patch "pidns: expose task pid_ns_for_children to userspace". Link: http://lkml.kernel.org/r/149086966919.4388.10564546347286785860.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton --- fs/nsfs.c | 4 +++- include/linux/proc_ns.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 1656843e87d2..495f12b83a7b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -195,9 +195,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d..88dba3b53375 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -14,6 +14,7 @@ struct inode; struct proc_ns_operations { const char *name; + const char *real_ns_name; int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); -- cgit v1.2.3 From 5a2ab9e5545bb110c2a31f9e07d40791f20aae18 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 5 Apr 2017 09:21:35 +1000 Subject: pidns: expose task pid_ns_for_children to userspace pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the course of their work.
This patch solves the problem, and it exposes pid_ns_for_children to ns directory in standard way with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] Link: http://lkml.kernel.org/r/149086967937.4388.471494976517194744.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton --- fs/proc/namespaces.c | 1 + include/linux/proc_ns.h | 1 + kernel/pid_namespace.c | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 88dba3b53375..58ab28d81fc2 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -27,6 +27,7 @@ extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9a..4dd02ff0b0bd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -374,6 +374,20 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -443,6 +457,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); -- cgit v1.2.3 From 65ab2c501ec974cbd5a7fa842b0d983933b7656e Mon Sep 17 00:00:00 2001 From: Zhang Xiao Date: Wed, 5 Apr 2017 09:21:36 +1000 Subject: taskstats: add e/u/stime for TGID command The elapsed time, user CPU time and system CPU time for the thread group status request are presently left at zero. Fill these in. 
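For illustration only (this fragment is not part of the patch, the helper name is made up, and the taskstats netlink request/reply plumbing is omitted), a userspace consumer that has already obtained a struct taskstats for a TGID would now see these accounting fields populated, all in microseconds:

    #include <stdio.h>
    #include <linux/taskstats.h>

    /* Print the thread-group times that this patch starts filling in. */
    static void print_tgid_times(const struct taskstats *ts)
    {
            printf("etime=%llu us utime=%llu us stime=%llu us\n",
                   (unsigned long long)ts->ac_etime,
                   (unsigned long long)ts->ac_utime,
                   (unsigned long long)ts->ac_stime);
    }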
Link: http://lkml.kernel.org/r/1488508424-12322-1-git-send-email-xiao.zhang@windriver.com Signed-off-by: Zhang Xiao Cc: Balbir Singh Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/taskstats.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78..802a2bb7ccb9 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -210,6 +210,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; /* * Add additional stats from live tasks except zombie thread group @@ -238,6 +239,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = ktime_get_ns() - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); -- cgit v1.2.3 From 36645cb271707698d13fcdcb0a002d379e54b14d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:36 +1000 Subject: taskstats-add-e-u-stime-for-tgid-command-fix run ktime_get_ns() a single time Cc: Balbir Singh Cc: Oleg Nesterov Cc: Zhang Xiao Signed-off-by: Andrew Morton --- kernel/taskstats.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 802a2bb7ccb9..1247db3ff002 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -211,6 +211,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) unsigned long flags; int rc = -ESRCH; u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -228,6 +229,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time = ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -240,7 +242,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) delayacct_add_tsk(stats, tsk); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + delta = start_time - tsk->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_etime += delta; -- cgit v1.2.3 From 70658bc98c977ff8abcf809e023fb3e0e2ef02df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:37 +1000 Subject: taskstats-add-e-u-stime-for-tgid-command-fix-fix include linux/sched/cputime.h for task_cputime() Cc: Balbir Singh Cc: Oleg Nesterov Cc: Zhang Xiao Signed-off-by: Andrew Morton --- kernel/taskstats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1247db3ff002..4559e914452b 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Maximum length of a cpumask that can be specified in -- cgit v1.2.3 From e342b172a3e716b213689dfdb0ef62e7de5e59ab Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:38 +1000 Subject: kcov: simplify interrupt check in_interrupt() semantics are confusing and wrong for most users as it also returns true when bh is disabled. Thus we open coded a proper check for interrupts in __sanitizer_cov_trace_pc() with a lengthy explanatory comment. 
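For reference, and paraphrasing include/linux/preempt.h rather than anything in this patch, the replacement predicate reduces to a single preempt_count() test:

    #define in_task()	(!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

so it is true only in plain process context, and false in hard IRQ, soft IRQ and NMI context.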
Use the new in_task() predicate instead. Link: http://lkml.kernel.org/r/20170321091026.139655-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Kefeng Wang Cc: James Morse Cc: Alexander Popov Cc: Andrey Konovalov Cc: Hillf Danton Signed-off-by: Andrew Morton --- kernel/kcov.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791..cd771993f96f 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. - * The checks for whether we are in an interrupt are open-coded, because - * 1. We can't use in_interrupt() here, since it also returns true - * when we are inside local_bh_disable() section. - * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), - * since that leads to slower generated code (three separate tests, - * one for each of the flags). */ - if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET - | NMI_MASK))) + if (!t || !in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From 0e01f1d018f05f36e3fdf18c48242a757ac17808 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Wed, 5 Apr 2017 09:21:39 +1000 Subject: scripts/gdb: add lx-fdtdump command lx-fdtdump dumps the flattened device tree passed to the kernel from the bootloader to the filename specified as the command argument. If no argument is provided it defaults to fdtdump.dtb. This then allows further post processing on the machine running GDB. The fdt header is also also printed in the GDB console. For example: (gdb) lx-fdtdump fdt_magic: 0xD00DFEED fdt_totalsize: 0xC108 off_dt_struct: 0x38 off_dt_strings: 0x3804 off_mem_rsvmap: 0x28 version: 17 last_comp_version: 16 Dumped fdt to fdtdump.dtb >fdtdump fdtdump.dtb | less This command is useful as the bootloader can often re-write parts of the device tree, and this can sometimes cause the kernel to not boot. Link: http://lkml.kernel.org/r/1481280065-5336-2-git-send-email-kbingham@kernel.org Signed-off-by: Peter Griffin Signed-off-by: Kieran Bingham Cc: Jason Wessel Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 7 ++++ scripts/gdb/linux/proc.py | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 7986f4e0da12..7aad82406422 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -14,6 +14,7 @@ #include #include +#include /* We need to stringify expanded macros so that they can be parsed */ @@ -50,3 +51,9 @@ LX_VALUE(MNT_NOEXEC) LX_VALUE(MNT_NOATIME) LX_VALUE(MNT_NODIRATIME) LX_VALUE(MNT_RELATIME) + +/* linux/of_fdt.h> */ +LX_VALUE(OF_DT_HEADER) + +/* Kernel Configs */ +LX_CONFIG(CONFIG_OF) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 38b1f09d1cd9..086d27223c0c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -16,6 +16,7 @@ from linux import constants from linux import utils from linux import tasks from linux import lists +from struct import * class LxCmdLine(gdb.Command): @@ -195,3 +196,75 @@ values of that process namespace""" info_opts(MNT_INFO, m_flags))) LxMounts() + + +class LxFdtDump(gdb.Command): + """Output Flattened Device Tree header and dump FDT blob to the filename + specified as the command argument. 
Equivalent to + 'cat /proc/fdt > fdtdump.dtb' on a running target""" + + def __init__(self): + super(LxFdtDump, self).__init__("lx-fdtdump", gdb.COMMAND_DATA, + gdb.COMPLETE_FILENAME) + + def fdthdr_to_cpu(self, fdt_header): + + fdt_header_be = ">IIIIIII" + fdt_header_le = " Date: Wed, 5 Apr 2017 09:21:40 +1000 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 +++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a7ff409f386d..0ca25413ad2d 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -38,6 +38,9 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +struct device; +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. 
-- cgit v1.2.3 From d2cf93a6f60fa275105a284f28e8f1f27276804d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:40 +1000 Subject: kernel-reboot-add-devm_register_reboot_notifier-fix move `struct device' forward declaration to top-of-file Cc: Andrey Smirnov Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 0ca25413ad2d..ecbf7b56b9db 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -5,6 +5,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -38,7 +40,6 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); -struct device; extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); extern int register_restart_handler(struct notifier_block *); -- cgit v1.2.3 From cb20e61ae7843089f254998cfa01e81b98c9e0ff Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:41 +1000 Subject: lib/fault-inject.c: use correct check for interrupts in_interrupt() also returns true when bh is disabled in task context. That's not what fail_task() wants to check. Use the new in_task() predicate that does the right thing. Link: http://lkml.kernel.org/r/20170321091805.140676-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reviewed-by: Akinobu Mita Signed-off-by: Andrew Morton --- lib/fault-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 6a823a53e357..4ff157159a0d 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -56,7 +56,7 @@ static void fail_dump(struct fault_attr *attr) static bool fail_task(struct fault_attr *attr, struct task_struct *task) { - return !in_interrupt() && task->make_it_fail; + return in_task() && task->make_it_fail; } #define MAX_STACK_TRACE_DEPTH 32 -- cgit v1.2.3 From a85fa626d581c57c4a9efedfe4811bf0671aec62 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 5 Apr 2017 09:21:41 +1000 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: === Write to this file of integer N makes N-th call in the current task fail (N is 0-based). Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below. === Why adding new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. 
Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Signed-off-by: Andrew Morton --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index c87b6b9a8a76..9fc0b20644a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1358,6 +1358,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return len; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3302,6 +3349,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. 
+ */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c3ee2281a56..27bcbd8269f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -941,6 +941,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..a0151354c959 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit v1.2.3 From 3d332675fd34293faf3af6d0ad4f4f2c54914efa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 5 Apr 2017 09:21:42 +1000 Subject: fault-inject-support-systematic-fault-injection-fix fix build Cc: Akinobu Mita Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9fc0b20644a6..ff6d71bcca59 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1377,7 +1377,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, if (n < 0 || n == INT_MAX) return -EINVAL; current->fail_nth = n + 1; - return len; + return count; } static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, -- cgit v1.2.3 From bb95cc4e70630cf553f54eec6baabe0787d2060b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 5 Apr 2017 09:21:43 +1000 Subject: lib/zlib_inflate/inftrees.c: fix potential buffer overflow smatch says: WARNING: please, no spaces at the start of a line #30: FILE: lib/zlib_inflate/inftrees.c:112: + for (min = 1; min < MAXBITS; min++)$ total: 0 errors, 1 warnings, 8 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/zlib-inflate-fix-potential-buffer-overflow.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. 
Please run checkpatch prior to sending patches Signed-off-by: Andrew Morton --- lib/zlib_inflate/inftrees.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index 3fe6ce5b53e5..028943052926 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -109,7 +109,7 @@ int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes, *bits = 1; return 0; /* no symbols, but wait for decoding to report error */ } - for (min = 1; min <= MAXBITS; min++) + for (min = 1; min < MAXBITS; min++) if (count[min] != 0) break; if (root < min) root = min; -- cgit v1.2.3 From dc2f0d5206967436bfecd01e759099b827004d49 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 5 Apr 2017 09:21:43 +1000 Subject: initramfs: provide a way to ignore image provided by bootloader Many "embedded" architectures provide CMDLINE_FORCE to allow the kernel to override the command line provided by an inflexible bootloader. However there is currrently no way for the kernel to override the initramfs image provided by the bootloader meaning there are still ways for bootloaders to make things difficult for us. Fix this by introducing INITRAMFS_FORCE which can prevent the kernel from loading the bootloader supplied image. We use CMDLINE_FORCE (and its friend CMDLINE_EXTEND) to imply that the system has an inflexible bootloader. This allow us to avoid presenting this config option to users of systems where inflexible bootloaders aren't usually a problem. Link: http://lkml.kernel.org/r/20170217121940.30126-1-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson Cc: Al Viro Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- usr/Kconfig | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index 981f286c1d16..bf7707b44988 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -611,7 +611,7 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - if (initrd_start) { + if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); diff --git a/usr/Kconfig b/usr/Kconfig index 6278f135256d..c0c48507e44e 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -21,6 +21,16 @@ config INITRAMFS_SOURCE If you are not sure, leave it blank. +config INITRAMFS_FORCE + bool "Ignore the initramfs passed by the bootloader" + depends on CMDLINE_EXTEND || CMDLINE_FORCE + help + This option causes the kernel to ignore the initramfs image + (or initrd image) passed to it by the bootloader. This is + analogous to CMDLINE_FORCE, which is found on some architectures, + and is useful if you cannot or don't want to change the image + your bootloader passes to the kernel. + config INITRAMFS_ROOT_UID int "User ID to map to 0 (user root)" depends on INITRAMFS_SOURCE!="" -- cgit v1.2.3 From 975c2abe355d835bfb01407cc22ced366261de45 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 5 Apr 2017 09:21:44 +1000 Subject: initramfs: use vfs_stat/lstat directly sys_newlstat is a system call implementation that is meant for user space, and that copies kernel-internal data structure to the user format, which is not needed for in-kernel users. Further, as we rearrange the system call implementation so we can extend it with 64-bit time_t, the prototype for sys_newlstat changes. 
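For reference, the in-kernel helpers used below operate on struct kstat directly; their signatures (paraphrased from include/linux/fs.h, with parameter names added here for readability) are roughly:

    int vfs_stat(const char __user *name, struct kstat *stat);
    int vfs_lstat(const char __user *name, struct kstat *stat);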
This changes the initramfs code to use vfs_lstat directly, to get it out of the way of the time_t changes, and make it slightly more efficient in the process. Along the same lines we also replace sys_stat and sys_stat64 with vfs_stat. Link: http://lkml.kernel.org/r/20170314214932.4052842-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Alexander Viro Signed-off-by: Andrew Morton --- init/do_mounts.h | 22 ++++------------------ init/initramfs.c | 12 ++++++------ 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/init/do_mounts.h b/init/do_mounts.h index 067af1d9e8b6..282d65bfd674 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev) return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); } -#if BITS_PER_LONG == 32 static inline u32 bstat(char *name) { - struct stat64 stat; - if (sys_stat64(name, &stat) != 0) + struct kstat stat; + if (vfs_stat(name, &stat) != 0) return 0; - if (!S_ISBLK(stat.st_mode)) + if (!S_ISBLK(stat.mode)) return 0; - if (stat.st_rdev != (u32)stat.st_rdev) - return 0; - return stat.st_rdev; -} -#else -static inline u32 bstat(char *name) -{ - struct stat stat; - if (sys_newstat(name, &stat) != 0) - return 0; - if (!S_ISBLK(stat.st_mode)) - return 0; - return stat.st_rdev; + return stat.rdev; } -#endif #ifdef CONFIG_BLK_DEV_RAM diff --git a/init/initramfs.c b/init/initramfs.c index bf7707b44988..a5b686696535 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -312,10 +312,10 @@ static int __init maybe_link(void) static void __init clean_path(char *path, umode_t fmode) { - struct stat st; + struct kstat st; - if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) { - if (S_ISDIR(st.st_mode)) + if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (S_ISDIR(st.mode)) sys_rmdir(path); else sys_unlink(path); @@ -581,13 +581,13 @@ static void __init clean_rootfs(void) num = sys_getdents64(fd, dirp, BUF_SIZE); while (num > 0) { while (num > 0) { - struct stat st; + struct kstat st; int ret; - ret = sys_newlstat(dirp->d_name, &st); + ret = vfs_lstat(dirp->d_name, &st); WARN_ON_ONCE(ret); if (!ret) { - if (S_ISDIR(st.st_mode)) + if (S_ISDIR(st.mode)) sys_rmdir(dirp->d_name); else sys_unlink(dirp->d_name); -- cgit v1.2.3 From 357d0c1c96dc69abe858381237d2573a766987b6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 5 Apr 2017 09:21:45 +1000 Subject: ipc/shm: some shmat cleanups Clean up early flag and address some minutia. Link: http://lkml.kernel.org/r/1486673582-6979-3-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton --- ipc/shm.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 481d2a9c298a..34c4344e8d4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1095,11 +1095,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; - unsigned long addr; + unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file; int err; - unsigned long flags; + unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; @@ -1111,7 +1111,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, err = -EINVAL; if (shmid < 0) goto out; - else if ((addr = (ulong)shmaddr)) { + + if (addr) { if (addr & (shmlba - 1)) { /* * Round down to the nearest multiple of shmlba. 
@@ -1126,13 +1127,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, #endif goto out; } - flags = MAP_SHARED | MAP_FIXED; - } else { - if ((shmflg & SHM_REMAP)) - goto out; - flags = MAP_SHARED; - } + flags |= MAP_FIXED; + } else if ((shmflg & SHM_REMAP)) + goto out; if (shmflg & SHM_RDONLY) { prot = PROT_READ; -- cgit v1.2.3 From 35ea10aa22077996ff58d495aca9397e61f4c1f6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 5 Apr 2017 09:21:45 +1000 Subject: sysv,ipc: cacheline align kern_ipc_perm Assign 'struct kern_ipc_perm' its own cacheline to avoid false sharing with sysv ipc calls. While the structure itself is rather read-mostly throughout the lifespan of ipc, the spinlock causes most of the invalidations. One example is 31a7c4746e9 ("ipc/sem.c: cacheline align the ipc spinlock for semaphores"). Therefore, extend this to all ipc. The effect of cacheline alignment on sems can be seen in sembench, which deals mostly with semtimedop wait/wakes is seen to improve raw throughput (worker loops) between 8 to 12% on a 24-core x86 with over 4 threads. Link: http://lkml.kernel.org/r/1486673582-6979-4-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton --- include/linux/ipc.h | 7 +++---- include/linux/sem.h | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 9d84942ae2e5..71fd92d81b26 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,7 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ /* used by in-kernel data structures */ -struct kern_ipc_perm -{ +struct kern_ipc_perm { spinlock_t lock; bool deleted; int id; @@ -18,9 +17,9 @@ struct kern_ipc_perm kgid_t gid; kuid_t cuid; kgid_t cgid; - umode_t mode; + umode_t mode; unsigned long seq; void *security; -}; +} ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/sem.h b/include/linux/sem.h index 4fc222f8755d..9edec926e9d9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -10,8 +10,7 @@ struct task_struct; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { - struct kern_ipc_perm ____cacheline_aligned_in_smp - sem_perm; /* permissions .. see ipc.h */ + struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ -- cgit v1.2.3
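A closing editorial aside, not part of this series: the effect of ____cacheline_aligned_in_smp can be sketched in plain C. Assuming a 64-byte cache line (a stand-in for SMP_CACHE_BYTES; the real value is architecture-specific) and a made-up struct perm_like standing in for kern_ipc_perm, aligning the structure means adjacent objects never share a line, so one object's hot spinlock no longer false-shares with its neighbour:

    #include <stdio.h>

    #define CACHELINE 64	/* assumed cache line size, standing in for SMP_CACHE_BYTES */

    /* A stand-in for kern_ipc_perm: on SMP builds ____cacheline_aligned_in_smp
     * expands to an aligned attribute much like this one. */
    struct perm_like {
    	int lock;		/* stands in for spinlock_t */
    	unsigned long seq;
    } __attribute__((aligned(CACHELINE)));

    int main(void)
    {
    	struct perm_like a[2];

    	/* sizeof is rounded up to the alignment, so a[0] and a[1]
    	 * start on different cache lines. */
    	printf("sizeof=%zu stride=%zu\n", sizeof(a[0]),
    	       (size_t)((char *)&a[1] - (char *)&a[0]));
    	return 0;
    }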