From 7ace99170789bc53cbb7e9e352d7a3851208fbcf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 6 Aug 2015 15:46:13 -0700
Subject: mm, meminit: allow early_pfn_to_nid to be used during runtime

early_pfn_to_nid() historically was inherently not SMP safe but only
used during boot which is inherently single threaded or during hotplug
which is protected by a giant mutex.

With deferred memory initialisation there was a thread-safe version
introduced and the early_pfn_to_nid would trigger a BUG_ON if used
unsafely.  Memory hotplug hit that check.  This patch makes
early_pfn_to_nid introduces a lock to make it safe to use during
hotplug.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Alex Ng <alexng@microsoft.com>
Tested-by: Alex Ng <alexng@microsoft.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Nicolai Stange <nicstange@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'mm/page_alloc.c')
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef19f22b2b7d..ea0e6a6e7a11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -981,21 +981,21 @@ static void __init __free_pages_boot_core(struct page *page,
 
 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
 	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
+	static DEFINE_SPINLOCK(early_pfn_lock);
 	int nid;
 
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
-
+	spin_lock(&early_pfn_lock);
 	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
+	if (nid < 0)
+		nid = 0;
+	spin_unlock(&early_pfn_lock);
+
+	return nid;
 }
 #endif
 
-- 
cgit v1.2.3


From d3cd131d935ab3bab700491edbbd7cad4040ce50 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Thu, 6 Aug 2015 15:46:16 -0700
Subject: mm, meminit: replace rwsem with completion

Commit 0e1cc95b4cc7 ("mm: meminit: finish initialisation of struct pages
before basic setup") introduced a rwsem to signal completion of the
initialization workers.

Lockdep complains about possible recursive locking:
  =============================================
  [ INFO: possible recursive locking detected ]
  4.1.0-12802-g1dc51b8 #3 Not tainted
  ---------------------------------------------
  swapper/0/1 is trying to acquire lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c7fb>] page_alloc_init_late+0xc7/0xe6

  but task is already holding lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c772>] page_alloc_init_late+0x3e/0xe6

Replace the rwsem by a completion together with an atomic
"outstanding work counter".

[peterz@infradead.org: Barrier removal on the grounds of being pointless]
[mgorman@suse.de: Applied review feedback]
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Alex Ng <alexng@microsoft.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea0e6a6e7a11..322628278ae4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
-#include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
@@ -1060,7 +1059,15 @@ static void __init deferred_free_range(struct page *page,
 		__free_pages_boot_core(page, pfn, 0);
 }
 
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+	if (atomic_dec_and_test(&pgdat_init_n_undone))
+		complete(&pgdat_init_all_done_comp);
+}
 
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
@@ -1077,7 +1084,7 @@ static int __init deferred_init_memmap(void *data)
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
 	if (first_init_pfn == ULONG_MAX) {
-		up_read(&pgdat_init_rwsem);
+		pgdat_init_report_one_done();
 		return 0;
 	}
 
@@ -1177,7 +1184,8 @@ free_range:
 
 	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
-	up_read(&pgdat_init_rwsem);
+
+	pgdat_init_report_one_done();
 	return 0;
 }
 
@@ -1185,14 +1193,14 @@ void __init page_alloc_init_late(void)
 {
 	int nid;
 
+	/* There will be num_node_state(N_MEMORY) threads */
+	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
 	for_each_node_state(nid, N_MEMORY) {
-		down_read(&pgdat_init_rwsem);
 		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
 	}
 
 	/* Block until all are initialised */
-	down_write(&pgdat_init_rwsem);
-	up_write(&pgdat_init_rwsem);
+	wait_for_completion(&pgdat_init_all_done_comp);
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
cgit v1.2.3


From 4248b0da460839e30eaaad78992b9a1dd3e63e21 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 6 Aug 2015 15:46:20 -0700
Subject: fs, file table: reinit files_stat.max_files after deferred memory
 initialisation

Dave Hansen reported the following;

	My laptop has been behaving strangely with 4.2-rc2.  Once I log
	in to my X session, I start getting all kinds of strange errors
	from applications and see this in my dmesg:

        	VFS: file-max limit 8192 reached

The problem is that the file-max is calculated before memory is fully
initialised and miscalculates how much memory the kernel is using.  This
patch recalculates file-max after deferred memory initialisation.  Note
that using memory hotplug infrastructure would not have avoided this
problem as the value is not recalculated after memory hot-add.

4.1:             files_stat.max_files = 6582781
4.2-rc2:         files_stat.max_files = 8192
4.2-rc2 patched: files_stat.max_files = 6562467

Small differences with the patch applied and 4.1 but not enough to matter.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Dave Hansen <dave.hansen@intel.com>
Cc: Nicolai Stange <nicstange@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Alex Ng <alexng@microsoft.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    |  3 +++
 5 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
 	}
 }
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc008c338f5a..84b783f277f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -55,7 +55,8 @@ struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
 
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
@@ -2245,7 +2246,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
 
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
 
 extern struct kmem_cache *names_cachep;
 
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
 	key_init();
 	security_init();
 	dbg_late_init();
-	vfs_caches_init(totalram_pages);
+	vfs_caches_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 322628278ae4..cb61f44eb3fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1201,6 +1201,9 @@ void __init page_alloc_init_late(void)
 
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
cgit v1.2.3


From f4c18e6f7b5bbb5b528b3334115806b0d76f50f9 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 6 Aug 2015 15:47:08 -0700
Subject: mm: check __PG_HWPOISON separately from PAGE_FLAGS_CHECK_AT_*

The race condition addressed in commit add05cecef80 ("mm: soft-offline:
don't free target page in successful page migration") was not closed
completely, because that can happen not only for soft-offline, but also
for hard-offline.  Consider that a slab page is about to be freed into
buddy pool, and then an uncorrected memory error hits the page just
after entering __free_one_page(), then VM_BUG_ON_PAGE(page->flags &
PAGE_FLAGS_CHECK_AT_PREP) is triggered, despite the fact that it's not
necessary because the data on the affected page is not consumed.

To solve it, this patch drops __PG_HWPOISON from page flag checks at
allocation/free time.  I think it's justified because __PG_HWPOISON
flags is defined to prevent the page from being reused, and setting it
outside the page's alloc-free cycle is a designed behavior (not a bug.)

For recent months, I was annoyed about BUG_ON when soft-offlined page
remains on lru cache list for a while, which is avoided by calling
put_page() instead of putback_lru_page() in page migration's success
path.  This means that this patch reverts a major change from commit
add05cecef80 about the new refcounting rule of soft-offlined pages, so
"reuse window" revives.  This will be closed by a subsequent patch.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dean Nelson <dnelson@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 10 +++++++---
 mm/huge_memory.c           |  7 +------
 mm/migrate.c               |  5 ++++-
 mm/page_alloc.c            |  4 ++++
 4 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f34e040b34e9..41c93844fb1d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -631,15 +631,19 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 	 1 << PG_private | 1 << PG_private_2 | \
 	 1 << PG_writeback | 1 << PG_reserved | \
 	 1 << PG_slab	 | 1 << PG_swapcache | 1 << PG_active | \
-	 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
+	 1 << PG_unevictable | __PG_MLOCKED | \
 	 __PG_COMPOUND_LOCK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
- * Pages being prepped should not have any flags set.  It they are set,
+ * Pages being prepped should not have these flags set.  It they are set,
  * there has been a kernel bug or struct page corruption.
+ *
+ * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
+ * alloc-free cycle to prevent from reusing the page.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP	((1 << NR_PAGEFLAGS) - 1)
+#define PAGE_FLAGS_CHECK_AT_PREP	\
+	(((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1 << PG_private | 1 << PG_private_2)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79ba..097c7a4bfbd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb__after_atomic();
 
-		/*
-		 * retain hwpoison flag of the poisoned tail page:
-		 *   fix for the unsuitable process killed on Guest Machine(KVM)
-		 *   by the memory-failure.
-		 */
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4e5ef1..f2415be7d93b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -950,7 +950,10 @@ out:
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (reason != MR_MEMORY_FAILURE)
+		/* Soft-offlined page shouldn't go through lru cache list */
+		if (reason == MR_MEMORY_FAILURE)
+			put_page(page);
+		else
 			putback_lru_page(page);
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb61f44eb3fc..beda41710802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1296,6 +1296,10 @@ static inline int check_new_page(struct page *page)
 		bad_reason = "non-NULL mapping";
 	if (unlikely(atomic_read(&page->_count) != 0))
 		bad_reason = "nonzero _count";
+	if (unlikely(page->flags & __PG_HWPOISON)) {
+		bad_reason = "HWPoisoned (hardware-corrupted)";
+		bad_flags = __PG_HWPOISON;
+	}
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
-- 
cgit v1.2.3


From f9126ab9241f66562debf69c2c9d8fee32ddcc53 Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Fri, 14 Aug 2015 15:35:16 -0700
Subject: memory-hotplug: fix wrong edge when hot add a new node

When we add a new node, the edge of memory may be wrong.

e.g. system has 4 nodes, and node3 is movable, node3 mem:[24G-32G],

1. hotremove the node3,
2. then hotadd node3 with a part of memory, mem:[26G-30G],
3. call hotadd_new_pgdat()
        free_area_init_node()
                get_pfn_range_for_nid()
4. it will return wrong start_pfn and end_pfn, because we have not
update the memblock.

This patch also fixes a BUG_ON during hot-addition, please see
http://marc.info/?l=linux-kernel&m=142961156129456&w=2

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 3 +++
 mm/page_alloc.c     | 8 ++++++++
 2 files changed, 11 insertions(+)

(limited to 'mm/page_alloc.c')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 003dbe4b060d..6da82bcb0a8b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1277,6 +1277,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 
 	/* create new memmap entry */
 	firmware_map_add_hotplug(start, start + size, "System RAM");
+	memblock_add_node(start, size, nid);
 
 	goto out;
 
@@ -2013,6 +2014,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
+	memblock_free(start, size);
+	memblock_remove(start, size);
 
 	arch_remove_memory(start, size);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index beda41710802..df959b7d6085 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5060,6 +5060,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 {
 	unsigned long zone_start_pfn, zone_end_pfn;
 
+	/* When hotadd a new node, the node should be empty */
+	if (!node_start_pfn && !node_end_pfn)
+		return 0;
+
 	/* Get the start and end of the zone */
 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
@@ -5123,6 +5127,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
 	unsigned long zone_start_pfn, zone_end_pfn;
 
+	/* When hotadd a new node, the node should be empty */
+	if (!node_start_pfn && !node_end_pfn)
+		return 0;
+
 	zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
 	zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
 
-- 
cgit v1.2.3


From 2f064f3485cd29633ad1b3cfb00cc519509a3d72 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 21 Aug 2015 14:11:51 -0700
Subject: mm: make page pfmemalloc check more robust

Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added
checks for page->pfmemalloc to __skb_fill_page_desc():

        if (page->pfmemalloc && !page->mapping)
                skb->pfmemalloc = true;

It assumes page->mapping == NULL implies that page->pfmemalloc can be
trusted.  However, __delete_from_page_cache() can set set page->mapping
to NULL and leave page->index value alone.  Due to being in union, a
non-zero page->index will be interpreted as true page->pfmemalloc.

So the assumption is invalid if the networking code can see such a page.
And it seems it can.  We have encountered this with a NFS over loopback
setup when such a page is attached to a new skbuf.  There is no copying
going on in this case so the page confuses __skb_fill_page_desc which
interprets the index as pfmemalloc flag and the network stack drops
packets that have been allocated using the reserves unless they are to
be queued on sockets handling the swapping which is the case here and
that leads to hangs when the nfs client waits for a response from the
server which has been dropped and thus never arrive.

The struct page is already heavily packed so rather than finding another
hole to put it in, let's do a trick instead.  We can reuse the index
again but define it to an impossible value (-1UL).  This is the page
index so it should never see the value that large.  Replace all direct
users of page->pfmemalloc by page_is_pfmemalloc which will hide this
nastiness from unspoiled eyes.

The information will get lost if somebody wants to use page->index
obviously but that was the case before and the original code expected
that the information should be persisted somewhere else if that is
really needed (e.g.  what SLAB and SLUB do).

[akpm@linux-foundation.org: fix blooper in slub]
Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Debugged-by: Vlastimil Babka <vbabka@suse.com>
Debugged-by: Jiri Bohac <jbohac@suse.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>	[3.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c     |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c         |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  2 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  2 +-
 include/linux/mm.h                                | 28 +++++++++++++++++++++++
 include/linux/mm_types.h                          |  9 --------
 include/linux/skbuff.h                            | 14 ++++--------
 mm/page_alloc.c                                   |  9 +++++---
 mm/slab.c                                         |  4 ++--
 mm/slub.c                                         |  2 +-
 net/core/skbuff.c                                 |  2 +-
 11 files changed, 47 insertions(+), 29 deletions(-)

(limited to 'mm/page_alloc.c')

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 982fdcdc795b..b5b2925103ec 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring,
 
 static inline bool fm10k_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 2f70a9b152bd..830466c49987 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
 
 static inline bool igb_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 9aa6104e34ea..ae21e0b06c3a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index e71cdde9cb01..1d7b00b038a2 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring,
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f92dbac..bf6f117fcf4d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1002,6 +1002,34 @@ static inline int page_mapped(struct page *page)
 	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
+/*
+ * Return true only if the page has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool page_is_pfmemalloc(struct page *page)
+{
+	/*
+	 * Page index cannot be this large so this must be
+	 * a pfmemalloc page.
+	 */
+	return page->index == -1UL;
+}
+
+/*
+ * Only to be called by the page allocator on a freshly allocated
+ * page.
+ */
+static inline void set_page_pfmemalloc(struct page *page)
+{
+	page->index = -1UL;
+}
+
+static inline void clear_page_pfmemalloc(struct page *page)
+{
+	page->index = 0;
+}
+
 /*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0038ac7466fd..15549578d559 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -63,15 +63,6 @@ struct page {
 		union {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* sl[aou]b first free object */
-			bool pfmemalloc;	/* If set by the page allocator,
-						 * ALLOC_NO_WATERMARKS was set
-						 * and the low watermark was not
-						 * met implying that the system
-						 * is under some pressure. The
-						 * caller should try ensure
-						 * this page is only used to
-						 * free other pages.
-						 */
 		};
 
 		union {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 22b6d9ca1654..9b88536487e6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	/*
-	 * Propagate page->pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page. If
-	 * pfmemalloc is set, we check the mapping as a mapping implies
-	 * page->index is set (index and pfmemalloc share space).
-	 * If it's a valid mapping, we cannot use page->pfmemalloc but we
-	 * do not lose pfmemalloc information as the pages would not be
-	 * allocated using __GFP_MEMALLOC.
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
 	 */
 	frag->page.p		  = page;
 	frag->page_offset	  = off;
 	skb_frag_size_set(frag, size);
 
 	page = compound_head(page);
-	if (page->pfmemalloc && !page->mapping)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc	= true;
 }
 
@@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_page(void)
 static inline void skb_propagate_pfmemalloc(struct page *page,
 					     struct sk_buff *skb)
 {
-	if (page && page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df959b7d6085..5b5240b7f642 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 	set_page_owner(page, order, gfp_flags);
 
 	/*
-	 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
 	 * allocate the page. The expectation is that the caller is taking
 	 * steps that will free more memory. The caller should avoid the page
 	 * being used for !PFMEMALLOC purposes.
 	 */
-	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+	if (alloc_flags & ALLOC_NO_WATERMARKS)
+		set_page_pfmemalloc(page);
+	else
+		clear_page_pfmemalloc(page);
 
 	return 0;
 }
@@ -3345,7 +3348,7 @@ refill:
 		atomic_add(size - 1, &page->_count);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page->pfmemalloc;
+		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = size;
 		nc->offset = size;
 	}
diff --git a/mm/slab.c b/mm/slab.c
index 200e22412a16..bbd0b47dc6a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	}
 
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
-	if (unlikely(page->pfmemalloc))
+	if (page_is_pfmemalloc(page))
 		pfmemalloc_active = true;
 
 	nr_pages = (1 << cachep->gfporder);
@@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 		add_zone_page_state(page_zone(page),
 			NR_SLAB_UNRECLAIMABLE, nr_pages);
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
diff --git a/mm/slub.c b/mm/slub.c
index 816df0016555..f68c0e50f3c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab_cache = s;
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	start = page_address(page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bf9a5d93c2d1..7b84330e5d30 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 
 	if (skb && frag_size) {
 		skb->head_frag = 1;
-		if (virt_to_head_page(data)->pfmemalloc)
+		if (page_is_pfmemalloc(virt_to_head_page(data)))
 			skb->pfmemalloc = 1;
 	}
 	return skb;
-- 
cgit v1.2.3