From 0ebff32c3637e0ed551c017eb9599ac108ab36aa Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Fri, 22 Feb 2013 16:33:59 -0800
Subject: memory-failure: fix an error of mce_bad_pages statistics

When doing

    $ echo paddr > /sys/devices/system/memory/soft_offline_page

to offline a *free* page, the value of mce_bad_pages will be added, and
the page is set HWPoison flag, but it is still managed by page buddy
alocator.

   $ cat /proc/meminfo | grep HardwareCorrupted

shows the value.

If we offline the same page, the value of mce_bad_pages will be added
*again*, this means the value is incorrect now.  Assume the page is
still free during this short time.

  soft_offline_page()
    get_any_page()
      "else if (is_free_buddy_page(p))" branch return 0
        "goto done";
           "atomic_long_add(1, &mce_bad_pages);"

This patch:

Move poisoned page check at the beginning of the function in order to
fix the error.

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Tested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6e4dd3e1c08..1a9242c53315 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1419,18 +1419,17 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_head(page);
 
+	if (PageHWPoison(hpage)) {
+		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
+		return -EBUSY;
+	}
+
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
 		return ret;
 	if (ret == 0)
 		goto done;
 
-	if (PageHWPoison(hpage)) {
-		put_page(hpage);
-		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
-		return -EBUSY;
-	}
-
 	/* Keep page count to indicate a given hugepage is isolated. */
 	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
 				MIGRATE_SYNC);
@@ -1441,12 +1440,11 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		return ret;
 	}
 done:
-	if (!PageHWPoison(hpage))
-		atomic_long_add(1 << compound_trans_order(hpage),
-				&mce_bad_pages);
+	/* keep elevated page count for bad page */
+	atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
 	set_page_hwpoison_huge_page(hpage);
 	dequeue_hwpoisoned_huge_page(hpage);
-	/* keep elevated page count for bad page */
+
 	return ret;
 }
 
@@ -1488,6 +1486,11 @@ int soft_offline_page(struct page *page, int flags)
 		}
 	}
 
+	if (PageHWPoison(page)) {
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return -EBUSY;
+	}
+
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
 		return ret;
@@ -1519,19 +1522,11 @@ int soft_offline_page(struct page *page, int flags)
 		return -EIO;
 	}
 
-	lock_page(page);
-	wait_on_page_writeback(page);
-
 	/*
 	 * Synchronized using the page lock with memory_failure()
 	 */
-	if (PageHWPoison(page)) {
-		unlock_page(page);
-		put_page(page);
-		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		return -EBUSY;
-	}
-
+	lock_page(page);
+	wait_on_page_writeback(page);
 	/*
 	 * Try to invalidate first. This should work for
 	 * non dirty unmapped page cache pages.
@@ -1583,8 +1578,9 @@ int soft_offline_page(struct page *page, int flags)
 		return ret;
 
 done:
+	/* keep elevated page count for bad page */
 	atomic_long_add(1, &mce_bad_pages);
 	SetPageHWPoison(page);
-	/* keep elevated page count for bad page */
+
 	return ret;
 }
-- 
cgit v1.2.3


From fa8dd8a92dccc1b29cefd7f51334285d6ed35281 Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Fri, 22 Feb 2013 16:34:00 -0800
Subject: memory-failure: do code refactor of soft_offline_page()

There are too many return points randomly intermingled with some "goto
done" return points.  So adjust the function structure, one for the
success path, the other for the failure path.  Use atomic_long_inc
instead of atomic_long_add.

Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1a9242c53315..f4c9fa1149e2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1421,12 +1421,13 @@ static int soft_offline_huge_page(struct page *page, int flags)
 
 	if (PageHWPoison(hpage)) {
 		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out;
 	}
 
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
-		return ret;
+		goto out;
 	if (ret == 0)
 		goto done;
 
@@ -1437,14 +1438,14 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
-		return ret;
+		goto out;
 	}
 done:
 	/* keep elevated page count for bad page */
 	atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
 	set_page_hwpoison_huge_page(hpage);
 	dequeue_hwpoisoned_huge_page(hpage);
-
+out:
 	return ret;
 }
 
@@ -1476,24 +1477,28 @@ int soft_offline_page(struct page *page, int flags)
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_trans_head(page);
 
-	if (PageHuge(page))
-		return soft_offline_huge_page(page, flags);
+	if (PageHuge(page)) {
+		ret = soft_offline_huge_page(page, flags);
+		goto out;
+	}
 	if (PageTransHuge(hpage)) {
 		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
 			pr_info("soft offline: %#lx: failed to split THP\n",
 				pfn);
-			return -EBUSY;
+			ret = -EBUSY;
+			goto out;
 		}
 	}
 
 	if (PageHWPoison(page)) {
 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out;
 	}
 
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
-		return ret;
+		goto out;
 	if (ret == 0)
 		goto done;
 
@@ -1512,14 +1517,15 @@ int soft_offline_page(struct page *page, int flags)
 		 */
 		ret = get_any_page(page, pfn, 0);
 		if (ret < 0)
-			return ret;
+			goto out;
 		if (ret == 0)
 			goto done;
 	}
 	if (!PageLRU(page)) {
 		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
 			pfn, page->flags);
-		return -EIO;
+		ret = -EIO;
+		goto out;
 	}
 
 	/*
@@ -1575,12 +1581,12 @@ int soft_offline_page(struct page *page, int flags)
 			pfn, ret, page_count(page), page->flags);
 	}
 	if (ret)
-		return ret;
+		goto out;
 
 done:
 	/* keep elevated page count for bad page */
-	atomic_long_add(1, &mce_bad_pages);
+	atomic_long_inc(&mce_bad_pages);
 	SetPageHWPoison(page);
-
+out:
 	return ret;
 }
-- 
cgit v1.2.3


From 293c07e31ab5a0b8df8c19b2a9e5c6fa30308849 Mon Sep 17 00:00:00 2001
From: Xishi Qiu <qiuxishi@huawei.com>
Date: Fri, 22 Feb 2013 16:34:02 -0800
Subject: memory-failure: use num_poisoned_pages instead of mce_bad_pages

Since MCE is an x86 concept, and this code is in mm/, it would be better
to use the name num_poisoned_pages instead of mce_bad_pages.

[akpm@linux-foundation.org: fix mm/sparse.c]
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Suggested-by: Borislav Petkov <bp@alien8.de>
Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c   |  2 +-
 include/linux/mm.h  |  2 +-
 mm/memory-failure.c | 16 ++++++++--------
 mm/sparse.c         |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645f7990..c3dac611c3c0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
 #ifdef CONFIG_MEMORY_FAILURE
-		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+		,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72a42c0fa633..a114b8eb7676 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1753,7 +1753,7 @@ extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
-extern atomic_long_t mce_bad_pages;
+extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
 extern void dump_page(struct page *page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f4c9fa1149e2..c95e19af510b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
 
 int sysctl_memory_failure_recovery __read_mostly = 1;
 
-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
 
@@ -1040,7 +1040,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 
 	nr_pages = 1 << compound_trans_order(hpage);
-	atomic_long_add(nr_pages, &mce_bad_pages);
+	atomic_long_add(nr_pages, &num_poisoned_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -1070,7 +1070,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			if (!PageHWPoison(hpage)
 			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
 			    || (p != hpage && TestSetPageHWPoison(hpage))) {
-				atomic_long_sub(nr_pages, &mce_bad_pages);
+				atomic_long_sub(nr_pages, &num_poisoned_pages);
 				return 0;
 			}
 			set_page_hwpoison_huge_page(hpage);
@@ -1128,7 +1128,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_sub(nr_pages, &num_poisoned_pages);
 		unlock_page(hpage);
 		put_page(hpage);
 		return 0;
@@ -1323,7 +1323,7 @@ int unpoison_memory(unsigned long pfn)
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_sub(nr_pages, &num_poisoned_pages);
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1337,7 +1337,7 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (TestClearPageHWPoison(page)) {
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &mce_bad_pages);
+		atomic_long_sub(nr_pages, &num_poisoned_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
@@ -1442,7 +1442,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	}
 done:
 	/* keep elevated page count for bad page */
-	atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+	atomic_long_add(1 << compound_trans_order(hpage), &num_poisoned_pages);
 	set_page_hwpoison_huge_page(hpage);
 	dequeue_hwpoisoned_huge_page(hpage);
 out:
@@ -1585,7 +1585,7 @@ int soft_offline_page(struct page *page, int flags)
 
 done:
 	/* keep elevated page count for bad page */
-	atomic_long_inc(&mce_bad_pages);
+	atomic_long_inc(&num_poisoned_pages);
 	SetPageHWPoison(page);
 out:
 	return ret;
diff --git a/mm/sparse.c b/mm/sparse.c
index cff97960f1d7..7ca6dc847947 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -783,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
 
 	for (i = 0; i < PAGES_PER_SECTION; i++) {
 		if (PageHWPoison(&memmap[i])) {
-			atomic_long_sub(1, &mce_bad_pages);
+			atomic_long_sub(1, &num_poisoned_pages);
 			ClearPageHWPoison(&memmap[i]);
 		}
 	}
-- 
cgit v1.2.3


From af8fae7c08862bb85c5cf445bf9b36314b82111f Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 22 Feb 2013 16:34:03 -0800
Subject: mm/memory-failure.c: clean up soft_offline_page()

Currently soft_offline_page() is hard to maintain because it has many
return points and goto statements.  All of this mess come from
get_any_page().

This function should only get page refcount as the name implies, but it
does some page isolating actions like SetPageHWPoison() and dequeuing
hugepage.  This patch corrects it and introduces some internal
subroutines to make soft offlining code more readable and maintainable.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 156 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 87 insertions(+), 69 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c95e19af510b..9cab165fd668 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1368,7 +1368,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
  * that is not free, and 1 for any other page type.
  * For 1 the page is returned with increased page count, otherwise not.
  */
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 {
 	int ret;
 
@@ -1393,11 +1393,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	if (!get_page_unless_zero(compound_head(p))) {
 		if (PageHuge(p)) {
 			pr_info("%s: %#lx free huge page\n", __func__, pfn);
-			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+			ret = 0;
 		} else if (is_free_buddy_page(p)) {
 			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
-			/* Set hwpoison bit while page is still isolated */
-			SetPageHWPoison(p);
 			ret = 0;
 		} else {
 			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,23 +1411,48 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	return ret;
 }
 
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+	int ret = __get_any_page(page, pfn, flags);
+
+	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+		/*
+		 * Try to free it.
+		 */
+		put_page(page);
+		shake_page(page, 1);
+
+		/*
+		 * Did it turn free?
+		 */
+		ret = __get_any_page(page, pfn, 0);
+		if (!PageLRU(page)) {
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+				pfn, page->flags);
+			return -EIO;
+		}
+	}
+	return ret;
+}
+
 static int soft_offline_huge_page(struct page *page, int flags)
 {
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_head(page);
 
+	/*
+	 * This double-check of PageHWPoison is to avoid the race with
+	 * memory_failure(). See also comment in __soft_offline_page().
+	 */
+	lock_page(hpage);
 	if (PageHWPoison(hpage)) {
+		unlock_page(hpage);
+		put_page(hpage);
 		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
-		ret = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
-
-	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		goto out;
-	if (ret == 0)
-		goto done;
+	unlock_page(hpage);
 
 	/* Keep page count to indicate a given hugepage is isolated. */
 	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
@@ -1438,17 +1461,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
-		goto out;
+	} else {
+		set_page_hwpoison_huge_page(hpage);
+		dequeue_hwpoisoned_huge_page(hpage);
+		atomic_long_add(1 << compound_trans_order(hpage),
+				&num_poisoned_pages);
 	}
-done:
 	/* keep elevated page count for bad page */
-	atomic_long_add(1 << compound_trans_order(hpage), &num_poisoned_pages);
-	set_page_hwpoison_huge_page(hpage);
-	dequeue_hwpoisoned_huge_page(hpage);
-out:
 	return ret;
 }
 
+static int __soft_offline_page(struct page *page, int flags);
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1477,62 +1501,60 @@ int soft_offline_page(struct page *page, int flags)
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_trans_head(page);
 
-	if (PageHuge(page)) {
-		ret = soft_offline_huge_page(page, flags);
-		goto out;
+	if (PageHWPoison(page)) {
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return -EBUSY;
 	}
-	if (PageTransHuge(hpage)) {
+	if (!PageHuge(page) && PageTransHuge(hpage)) {
 		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
 			pr_info("soft offline: %#lx: failed to split THP\n",
 				pfn);
-			ret = -EBUSY;
-			goto out;
+			return -EBUSY;
 		}
 	}
 
-	if (PageHWPoison(page)) {
-		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		ret = -EBUSY;
-		goto out;
-	}
-
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
-		goto out;
-	if (ret == 0)
-		goto done;
-
-	/*
-	 * Page cache page we can handle?
-	 */
-	if (!PageLRU(page)) {
-		/*
-		 * Try to free it.
-		 */
-		put_page(page);
-		shake_page(page, 1);
-
-		/*
-		 * Did it turn free?
-		 */
-		ret = get_any_page(page, pfn, 0);
-		if (ret < 0)
-			goto out;
-		if (ret == 0)
-			goto done;
-	}
-	if (!PageLRU(page)) {
-		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-			pfn, page->flags);
-		ret = -EIO;
-		goto out;
+		return ret;
+	if (ret) { /* for in-use pages */
+		if (PageHuge(page))
+			ret = soft_offline_huge_page(page, flags);
+		else
+			ret = __soft_offline_page(page, flags);
+	} else { /* for free pages */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_trans_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
+	/* keep elevated page count for bad page */
+	return ret;
+}
+
+static int __soft_offline_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
 
 	/*
-	 * Synchronized using the page lock with memory_failure()
+	 * Check PageHWPoison again inside page lock because PageHWPoison
+	 * is set by memory_failure() outside page lock. Note that
+	 * memory_failure() also double-checks PageHWPoison inside page lock,
+	 * so there's no race between soft_offline_page() and memory_failure().
 	 */
 	lock_page(page);
 	wait_on_page_writeback(page);
+	if (PageHWPoison(page)) {
+		unlock_page(page);
+		put_page(page);
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return -EBUSY;
+	}
 	/*
 	 * Try to invalidate first. This should work for
 	 * non dirty unmapped page cache pages.
@@ -1545,9 +1567,10 @@ int soft_offline_page(struct page *page, int flags)
 	 */
 	if (ret == 1) {
 		put_page(page);
-		ret = 0;
 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
-		goto done;
+		SetPageHWPoison(page);
+		atomic_long_inc(&num_poisoned_pages);
+		return 0;
 	}
 
 	/*
@@ -1575,18 +1598,13 @@ int soft_offline_page(struct page *page, int flags)
 				pfn, ret, page->flags);
 			if (ret > 0)
 				ret = -EIO;
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
 		}
 	} else {
 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
 			pfn, ret, page_count(page), page->flags);
 	}
-	if (ret)
-		goto out;
-
-done:
-	/* keep elevated page count for bad page */
-	atomic_long_inc(&num_poisoned_pages);
-	SetPageHWPoison(page);
-out:
 	return ret;
 }
-- 
cgit v1.2.3


From 4db0e950c5b78586bea9e1b027be849631f89a17 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 22 Feb 2013 16:34:05 -0800
Subject: mm/memory-failure.c: fix wrong num_poisoned_pages in handling memory
 error on thp

num_poisoned_pages counts up the number of pages isolated by memory
errors.  But for thp, only one subpage is isolated because memory error
handler splits it, so it's wrong to add (1 << compound_trans_order).

[akpm@linux-foundation.org: tweak comment]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9cab165fd668..1a56d63adf9c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1039,7 +1039,17 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(hpage);
+	/*
+	 * Currently errors on hugetlbfs pages are measured in hugepage units,
+	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
+	 * transparent hugepages, they are supposed to be split and error
+	 * measurement is done in normal page units.  So nr_pages should be one
+	 * in this case.
+	 */
+	if (PageHuge(p))
+		nr_pages = 1 << compound_order(hpage);
+	else /* normal page or thp */
+		nr_pages = 1;
 	atomic_long_add(nr_pages, &num_poisoned_pages);
 
 	/*
-- 
cgit v1.2.3


From 9c620e2bc5aa4256c102ada34e6c76204ed5898b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 22 Feb 2013 16:35:14 -0800
Subject: mm: remove offlining arg to migrate_pages

No functional change, but the only purpose of the offlining argument to
migrate_pages() etc, was to ensure that __unmap_and_move() could migrate a
KSM page for memory hotremove (which took ksm_thread_mutex) but not for
other callers.  Now all cases are safe, remove the arg.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 14 ++++++--------
 mm/compaction.c         |  2 +-
 mm/memory-failure.c     |  7 +++----
 mm/memory_hotplug.c     |  3 +--
 mm/mempolicy.c          |  8 +++-----
 mm/migrate.c            | 35 +++++++++++++----------------------
 mm/page_alloc.c         |  6 ++----
 7 files changed, 29 insertions(+), 46 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 1e9f627967a3..a405d3dc0f61 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
-			unsigned long private, bool offlining,
-			enum migrate_mode mode, int reason);
+		unsigned long private, enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
-			unsigned long private, bool offlining,
-			enum migrate_mode mode);
+		unsigned long private, enum migrate_mode mode);
 
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, bool offlining,
-		enum migrate_mode mode, int reason) { return -ENOSYS; }
+		unsigned long private, enum migrate_mode mode, int reason)
+	{ return -ENOSYS; }
 static inline int migrate_huge_page(struct page *page, new_page_t x,
-		unsigned long private, bool offlining,
-		enum migrate_mode mode) { return -ENOSYS; }
+		unsigned long private, enum migrate_mode mode)
+	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
index ef53f352b606..25e75e3e2ac6 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -980,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				(unsigned long)cc, false,
+				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1a56d63adf9c..e4683459ab26 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1465,7 +1465,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	unlock_page(hpage);
 
 	/* Keep page count to indicate a given hugepage is isolated. */
-	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
+	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC);
 	put_page(hpage);
 	if (ret) {
@@ -1597,11 +1597,10 @@ static int __soft_offline_page(struct page *page, int flags)
 	if (!ret) {
 		LIST_HEAD(pagelist);
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
-					    page_is_file_cache(page));
+					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-							false, MIGRATE_SYNC,
-							MR_MEMORY_FAILURE);
+					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			putback_lru_pages(&pagelist);
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 77a6abfe3291..dda1ca695a08 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1286,8 +1286,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * migrate_pages returns # of failed pages.
 		 */
 		ret = migrate_pages(&source, alloc_migrate_target, 0,
-							true, MIGRATE_SYNC,
-							MR_MEMORY_HOTPLUG);
+					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_lru_pages(&source);
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d344c36db63f..e2661d1c5c33 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1014,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-							false, MIGRATE_SYNC,
-							MR_SYSCALL);
+					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1259,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma,
-						false, MIGRATE_SYNC,
-						MR_MEMPOLICY_MBIND);
+					(unsigned long)vma,
+					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_lru_pages(&pagelist);
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index 20a03eb0667f..3bbaf5d230b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -701,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 }
 
 static int __unmap_and_move(struct page *page, struct page *newpage,
-			int force, bool offlining, enum migrate_mode mode)
+				int force, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
 	int remap_swapcache = 1;
@@ -847,8 +847,7 @@ out:
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, bool offlining,
-			enum migrate_mode mode)
+			struct page *page, int force, enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -866,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		if (unlikely(split_huge_page(page)))
 			goto out;
 
-	rc = __unmap_and_move(page, newpage, force, offlining, mode);
+	rc = __unmap_and_move(page, newpage, force, mode);
 
 	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
 		/*
@@ -926,8 +925,7 @@ out:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
 				unsigned long private, struct page *hpage,
-				int force, bool offlining,
-				enum migrate_mode mode)
+				int force, enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -989,9 +987,8 @@ out:
  *
  * Return: Number of pages not migrated or error code.
  */
-int migrate_pages(struct list_head *from,
-		new_page_t get_new_page, unsigned long private, bool offlining,
-		enum migrate_mode mode, int reason)
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+		unsigned long private, enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1012,8 +1009,7 @@ int migrate_pages(struct list_head *from,
 			cond_resched();
 
 			rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, offlining,
-						mode);
+						page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1046,15 +1042,13 @@ out:
 }
 
 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
-		      unsigned long private, bool offlining,
-		      enum migrate_mode mode)
+		      unsigned long private, enum migrate_mode mode)
 {
 	int pass, rc;
 
 	for (pass = 0; pass < 10; pass++) {
-		rc = unmap_and_move_huge_page(get_new_page,
-					      private, hpage, pass > 2, offlining,
-					      mode);
+		rc = unmap_and_move_huge_page(get_new_page, private,
+						hpage, pass > 2, mode);
 		switch (rc) {
 		case -ENOMEM:
 			goto out;
@@ -1177,8 +1171,7 @@ set_status:
 	err = 0;
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_page_node,
-				(unsigned long)pm, 0, MIGRATE_SYNC,
-				MR_SYSCALL);
+				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1613,10 +1606,8 @@ int migrate_misplaced_page(struct page *page, int node)
 		goto out;
 
 	list_add(&page->lru, &migratepages);
-	nr_remaining = migrate_pages(&migratepages,
-			alloc_misplaced_dst_page,
-			node, false, MIGRATE_ASYNC,
-			MR_NUMA_MISPLACED);
+	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		putback_lru_pages(&migratepages);
 		isolated = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 445718b328b6..64c83a8c3220 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6084,10 +6084,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 							&cc->migratepages);
 		cc->nr_migratepages -= nr_reclaimed;
 
-		ret = migrate_pages(&cc->migratepages,
-				    alloc_migrate_target,
-				    0, false, MIGRATE_SYNC,
-				    MR_CMA);
+		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+				    0, MIGRATE_SYNC, MR_CMA);
 	}
 	if (ret < 0) {
 		putback_movable_pages(&cc->migratepages);
-- 
cgit v1.2.3


From 524fca1e7356f8f9f92c51ca52727187872fc5f5 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 22 Feb 2013 16:35:51 -0800
Subject: HWPOISON: fix misjudgement of page_action() for errors on mlocked
 pages

memory_failure() can't handle memory errors on mlocked pages correctly,
because page_action() judges such errors as ones on "unknown pages"
instead of ones on "unevictable LRU page" or "mlocked LRU page".  In
order to determine page_state page_action() checks page flags at the
timing of the judgement, but such page flags are not the same with those
just after memory_failure() is called, because memory_failure() does
unmapping of the error pages before doing page_action().  This unmapping
changes the page state, especially page_remove_rmap() (called from
try_to_unmap_one()) clears PG_mlocked, so page_action() can't catch
mlocked pages after that.

With this patch, we store the page flag of the error page before doing
unmap, and (only) if the first check with page flags at the time decided
the error page is unknown, we do the second check with the stored page
flag.  This implementation doesn't change error handling for the page
types for which the first check can determine the page state correctly.

[akpm@linux-foundation.org: tweak comments]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e4683459ab26..8f13b26a5b93 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	struct page *hpage;
 	int res;
 	unsigned int nr_pages;
+	unsigned long page_flags;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1128,6 +1129,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 */
 	lock_page(hpage);
 
+	/*
+	 * We use page flags to determine what action should be taken, but
+	 * the flags can be modified by the error containment action.  One
+	 * example is an mlocked page, where PG_mlocked is cleared by
+	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+	 * correctly, we save a copy of the page flags at this time.
+	 */
+	page_flags = p->flags;
+
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
 	 */
@@ -1186,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 
 	res = -EBUSY;
-	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
-			res = page_action(ps, p, pfn);
+	/*
+	 * The first check uses the current page flags which may not have any
+	 * relevant information. The second check with the saved page flagss is
+	 * carried out only if the first check can't determine the page status.
+	 */
+	for (ps = error_states;; ps++)
+		if ((p->flags & ps->mask) == ps->res)
 			break;
-		}
-	}
+	if (!ps->mask)
+		for (ps = error_states;; ps++)
+			if ((page_flags & ps->mask) == ps->res)
+				break;
+	res = page_action(ps, p, pfn);
 out:
 	unlock_page(hpage);
 	return res;
-- 
cgit v1.2.3


From 5f4b9fc5c1d3c8fc6037fa99d527ad3264dc0038 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 22 Feb 2013 16:35:53 -0800
Subject: HWPOISON: change order of error_states[]'s elements

error_states[] has two separate states "unevictable LRU page" and
"mlocked LRU page", and the former one has the higher priority now.  But
because of that the latter one is rarely chosen because pages with
PageMlocked highly likely have PG_unevictable set.  On the other hand,
PG_unevictable without PageMlocked is common for ramfs or SHM_LOCKed
shared memory, so reversing the priority of these two states helps us
clearly distinguish them.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm/memory-failure.c')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8f13b26a5b93..df0694c6adef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -784,12 +784,12 @@ static struct page_state {
 	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
 	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },
 
-	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
-	{ unevict,	unevict,	"clean unevictable LRU", me_pagecache_clean },
-
 	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
 	{ mlock,	mlock,		"clean mlocked LRU",	me_pagecache_clean },
 
+	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
+	{ unevict,	unevict,	"clean unevictable LRU", me_pagecache_clean },
+
 	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
 	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
 
-- 
cgit v1.2.3